Codette-Reasoning / testing /test_adapters_gguf.py

Raiff1982

Upload 120 files

ed1b365 verified 1 day ago

4.72 kB

	#!/usr/bin/env python3
	"""Codette LoRA Adapter Inference Test via llama.cpp

	Uses GGUF base model + GGUF LoRA adapters for low-memory inference.
	Base: Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (~4.6 GB)
	LoRA: newton-lora-f16.gguf, davinci-lora-f16.gguf (~27 MB each)
	"""

	import os, sys, time

	os.environ["PATH"] = r"J:\Lib\site-packages\Library\bin" + os.pathsep + os.environ.get("PATH", "")
	# Fix Windows console encoding for Unicode characters (π, etc.)
	sys.stdout.reconfigure(encoding='utf-8', errors='replace')

	from llama_cpp import Llama

	BASE_GGUF = r"J:\codette-training-lab\bartowski\Meta-Llama-3.1-8B-Instruct-GGUF\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
	NEWTON_LORA = r"J:\codette-training-lab\adapters\newton-lora-f16.gguf"
	DAVINCI_LORA = r"J:\codette-training-lab\adapters\davinci-lora-f16.gguf"

	TEST_PROMPTS = [
	{
	"system": "You are a helpful assistant. Answer concisely in 2-3 sentences.",
	"user": "Explain why objects fall to the ground.",
	"tag": "physics"
	},
	{
	"system": "You are a helpful assistant. Answer concisely in 2-3 sentences.",
	"user": "What is the relationship between consciousness and the physical world?",
	"tag": "philosophy"
	},
	{
	"system": "You are a helpful assistant. Answer concisely in 2-3 sentences.",
	"user": "How would you design a system that learns from its own mistakes?",
	"tag": "systems"
	},
	]

	GEN_KWARGS = dict(
	max_tokens=200,
	temperature=0.7,
	top_p=0.9,
	stop=["<\|eot_id\|>", "<\|end_of_text\|>"],
	)


	def run_test(model_label, llm, prompts):
	"""Run all test prompts against a loaded model."""
	print(f"\n{'=' * 60}")
	print(f" {model_label}")
	print(f"{'=' * 60}")

	responses = []
	for p in prompts:
	print(f"\n [{p['tag']}] {p['user']}")
	start = time.time()
	result = llm.create_chat_completion(
	messages=[
	{"role": "system", "content": p["system"]},
	{"role": "user", "content": p["user"]},
	],
	**GEN_KWARGS,
	)
	elapsed = time.time() - start
	text = result["choices"][0]["message"]["content"].strip()
	tokens = result["usage"]["completion_tokens"]
	tps = tokens / elapsed if elapsed > 0 else 0
	print(f" Response ({elapsed:.1f}s, {tokens} tok, {tps:.1f} tok/s):")
	print(f" > {text}")
	responses.append({"tag": p["tag"], "response": text, "tokens": tokens, "time": elapsed})

	return responses


	def main():
	print("=" * 60)
	print("Codette LoRA Adapter Inference Test")
	print("=" * 60)
	print(f"Base model: {os.path.basename(BASE_GGUF)}")
	print(f"Newton LoRA: {os.path.basename(NEWTON_LORA)}")
	print(f"DaVinci LoRA: {os.path.basename(DAVINCI_LORA)}")

	all_results = {}

	# --- Test 1: BASE MODEL (no adapter) ---
	print("\nLoading BASE model (no adapter)...")
	start = time.time()
	llm_base = Llama(
	model_path=BASE_GGUF,
	n_ctx=2048,
	n_gpu_layers=0, # CPU only to save VRAM
	verbose=False,
	)
	print(f" Loaded in {time.time()-start:.1f}s")

	all_results["base"] = run_test("BASE MODEL (no adapter)", llm_base, TEST_PROMPTS)
	del llm_base

	# --- Test 2: NEWTON adapter ---
	print("\n\nLoading BASE + NEWTON adapter...")
	start = time.time()
	llm_newton = Llama(
	model_path=BASE_GGUF,
	lora_path=NEWTON_LORA,
	n_ctx=2048,
	n_gpu_layers=0,
	verbose=False,
	)
	print(f" Loaded in {time.time()-start:.1f}s")

	all_results["newton"] = run_test("NEWTON ADAPTER", llm_newton, TEST_PROMPTS)
	del llm_newton

	# --- Test 3: DAVINCI adapter ---
	print("\n\nLoading BASE + DAVINCI adapter...")
	start = time.time()
	llm_davinci = Llama(
	model_path=BASE_GGUF,
	lora_path=DAVINCI_LORA,
	n_ctx=2048,
	n_gpu_layers=0,
	verbose=False,
	)
	print(f" Loaded in {time.time()-start:.1f}s")

	all_results["davinci"] = run_test("DAVINCI ADAPTER", llm_davinci, TEST_PROMPTS)
	del llm_davinci

	# --- Summary ---
	print(f"\n{'=' * 60}")
	print("COMPARISON SUMMARY")
	print(f"{'=' * 60}")
	for tag in ["physics", "philosophy", "systems"]:
	print(f"\n--- {tag.upper()} ---")
	for model_name in ["base", "newton", "davinci"]:
	for r in all_results[model_name]:
	if r["tag"] == tag:
	short = r["response"][:120] + "..." if len(r["response"]) > 120 else r["response"]
	print(f" {model_name:8s}: {short}")

	print(f"\n{'=' * 60}")
	print("TEST COMPLETE")
	print(f"{'=' * 60}")


	if __name__ == "__main__":
	main()