Spaces:

ysingh-aiml
/

tinyllama-quantization-gguf

Sleeping

App Files Files Community

tinyllama-quantization-gguf / inference_server /server.py

ysingh-aiml

Reinitialize Space repo with latest code

659322c about 2 months ago

raw

history blame contribute delete

4.7 kB

	#!/usr/bin/env python3
	"""Launch llama.cpp `llama-server` for a quantized GGUF (local / edge only)."""

	from __future__ import annotations

	import argparse
	import os
	import shutil
	import subprocess
	import sys
	from pathlib import Path

	# Directory one level above the folder that contains this script.
	ROOT = Path(__file__).resolve().parent.parent
	# GGUF served when neither --quant nor --model is passed.
	DEFAULT_MODEL = ROOT.joinpath("models", "model-Q4_K_M.gguf")
	# Hub repo used for downloads unless TASK4_GGUF_REPO or --hf-repo overrides it.
	DEFAULT_HF_REPO = "ysingh-aiml/tinyllama-alpaca-lora-gguf"
	# Accepted --quant values mapped to the GGUF filename each one selects.
	QUANT_FILES = dict(
	    q4_k_m="model-Q4_K_M.gguf",
	    q5_k_m="model-Q5_K_M.gguf",
	    q8_0="model-Q8_0.gguf",
	)


	def download_gguf(
	    repo_id: str,
	    filename: str,
	    dest_dir: Path,
	    revision: str | None = None,
	) -> Path:
	    """Download a single file from a Hugging Face Hub repo into ``dest_dir``.

	    Args:
	        repo_id: Hub repository id to download from.
	        filename: Name of the file inside the repo.
	        dest_dir: Local directory to place the file in; created (with
	            parents) if it does not exist yet.
	        revision: Optional Hub revision passed through to the download
	            call; ``None`` leaves the choice to ``hf_hub_download``.

	    Returns:
	        The path reported by ``hf_hub_download``, wrapped in a ``Path``.

	    Any exception raised by ``hf_hub_download`` propagates to the caller.
	    """
	    # Imported here rather than at module level, so huggingface_hub is only
	    # required when a download actually happens.
	    from huggingface_hub import hf_hub_download

	    dest_dir.mkdir(parents=True, exist_ok=True)
	    # NOTE(review): recent huggingface_hub releases appear to deprecate
	    # `local_dir_use_symlinks` - confirm against the installed version
	    # before removing it.
	    out = hf_hub_download(
	        repo_id=repo_id,
	        filename=filename,
	        local_dir=str(dest_dir),
	        local_dir_use_symlinks=False,
	        revision=revision,
	    )
	    return Path(out)


	def resolve_llama_server() -> str:
	    """Return the llama-server executable to launch, or ``""`` if none is found.

	    A non-blank ``LLAMA_SERVER`` environment variable takes precedence and is
	    returned whitespace-stripped, without checking that the file exists.
	    Otherwise ``llama-server`` is looked up on PATH.
	    """
	    override = os.environ.get("LLAMA_SERVER", "").strip()
	    if not override:
	        found = shutil.which("llama-server")
	        return found if found else ""
	    return override


	def _build_parser() -> argparse.ArgumentParser:
	    """Create the command-line parser for the launcher."""
	    parser = argparse.ArgumentParser(description="Start llama.cpp llama-server with a GGUF model.")
	    parser.add_argument(
	        "--quant",
	        choices=sorted(QUANT_FILES.keys()),
	        default=None,
	        help="Pick a Hub filename under --hf-repo (sets model to models/<file>).",
	    )
	    parser.add_argument("--model", type=Path, default=None, help="Path to .gguf (default: models/model-Q4_K_M.gguf)")
	    parser.add_argument(
	        "--hf-repo",
	        default=os.environ.get("TASK4_GGUF_REPO", DEFAULT_HF_REPO),
	        help="Hugging Face model repo id for --fetch (default: env TASK4_GGUF_REPO or built-in)",
	    )
	    parser.add_argument("--revision", default=None, help="Hub git revision (branch / tag / commit) for download")
	    parser.add_argument(
	        "--no-fetch",
	        action="store_true",
	        help="Do not download from the Hub if the model file is missing",
	    )
	    parser.add_argument(
	        "--fetch-only",
	        action="store_true",
	        help="Download the GGUF from the Hub then exit (no llama-server)",
	    )
	    parser.add_argument("--host", default="127.0.0.1")
	    parser.add_argument("--port", type=int, default=8080)
	    parser.add_argument("--threads", type=int, default=8)
	    parser.add_argument("--ctx-size", type=int, default=2048)
	    parser.add_argument(
	        "--n-gpu-layers",
	        type=int,
	        default=0,
	        help="GPU/Metal offload layer count; 0 = CPU only",
	    )
	    return parser


	def _select_model_path(args: argparse.Namespace) -> Path:
	    """Pick the GGUF path: --quant wins over --model, which wins over the default."""
	    if args.quant:
	        return (ROOT / "models" / QUANT_FILES[args.quant]).resolve()
	    if args.model is not None:
	        return args.model.resolve()
	    return DEFAULT_MODEL.resolve()


	def _build_command(exe: str, model_path: Path, args: argparse.Namespace) -> list[str]:
	    """Assemble the llama-server argument vector from the parsed options."""
	    return [
	        exe,
	        "-m",
	        str(model_path),
	        "--host",
	        args.host,
	        "--port",
	        str(args.port),
	        "--threads",
	        str(args.threads),
	        "--ctx-size",
	        str(args.ctx_size),
	        "--n-gpu-layers",
	        str(args.n_gpu_layers),
	        "--parallel",
	        "1",
	        "--no-warmup",
	    ]


	def main() -> None:
	    """Parse the CLI, make sure the GGUF file exists, then run llama-server.

	    Steps:
	      1. Choose the model path (--quant, else --model, else the default).
	      2. If the file is missing, download it from --hf-repo unless
	         --no-fetch was given.
	      3. With --fetch-only, print the model path to stdout and stop.
	      4. Otherwise locate llama-server and run it in the foreground.

	    Always leaves via ``SystemExit``: 0 after --fetch-only, 1 for a missing
	    model, failed download, missing or unlaunchable llama-server, 130 when
	    interrupted with Ctrl-C, otherwise llama-server's own return code.
	    """
	    args = _build_parser().parse_args()
	    model_path = _select_model_path(args)

	    if not model_path.is_file():
	        if args.no_fetch:
	            print(
	                f"Model not found: {model_path}\n"
	                "Remove --no-fetch to download from the Hub, or place a .gguf at this path.",
	                file=sys.stderr,
	            )
	            raise SystemExit(1)
	        print(f"Downloading {model_path.name} from {args.hf_repo} …", file=sys.stderr)
	        try:
	            downloaded = download_gguf(
	                args.hf_repo,
	                model_path.name,
	                model_path.parent,
	                revision=args.revision,
	            )
	        except Exception as e:
	            # Top-level boundary: any download error becomes a short message
	            # and a non-zero exit instead of a traceback.
	            print(f"Download failed: {e}", file=sys.stderr)
	            raise SystemExit(1) from e
	        # Use the path the download reported, and re-check that it exists.
	        model_path = downloaded.resolve()
	        if not model_path.is_file():
	            print(f"Expected file after download: {model_path}", file=sys.stderr)
	            raise SystemExit(1)
	        print(f"Model ready: {model_path}", file=sys.stderr)

	    if args.fetch_only:
	        # Only the path goes to stdout; all progress messages use stderr.
	        print(model_path)
	        raise SystemExit(0)

	    exe = resolve_llama_server()
	    if not exe:
	        print(
	            "llama-server not found. Build llama.cpp and set LLAMA_SERVER=/path/to/llama-server "
	            "or put `llama-server` on PATH.",
	            file=sys.stderr,
	        )
	        raise SystemExit(1)

	    cmd = _build_command(exe, model_path, args)
	    # Flush so this line is written before the child starts using stdout.
	    print("Running:", " ".join(cmd), flush=True)
	    try:
	        exit_code = subprocess.call(cmd)
	    except OSError as e:
	        # LLAMA_SERVER is not checked for existence, so a wrong or
	        # non-executable path only fails here; report it like the other errors.
	        print(f"Failed to start llama-server ({exe}): {e}", file=sys.stderr)
	        raise SystemExit(1) from e
	    except KeyboardInterrupt:
	        # Ctrl-C is the normal way to stop the server; exit without a traceback.
	        raise SystemExit(130) from None
	    raise SystemExit(exit_code)


	if __name__ == "__main__":
	    # Run the launcher only when executed as a script, not when imported.
	    main()