Codette-Reasoning / testing /test_adapters_gguf.py
Raiff1982's picture
Upload 120 files
ed1b365 verified
#!/usr/bin/env python3
"""Codette LoRA Adapter Inference Test via llama.cpp
Uses GGUF base model + GGUF LoRA adapters for low-memory inference.
Base: Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (~4.6 GB)
LoRA: newton-lora-f16.gguf, davinci-lora-f16.gguf (~27 MB each)
"""
import os, sys, time
# Prepend a local native-library directory to PATH. This has to happen before
# the `llama_cpp` import below (presumably so its DLLs resolve -- TODO confirm
# this directory is still required on the target machine).
os.environ["PATH"] = r"J:\Lib\site-packages\Library\bin" + os.pathsep + os.environ.get("PATH", "")
# Fix Windows console encoding for Unicode characters (π, etc.)
# Guarded: stdout can be None or a replacement stream that has no
# reconfigure() (e.g. io.StringIO); in that case the tweak is simply skipped
# instead of aborting the script with AttributeError.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding='utf-8', errors='replace')
from llama_cpp import Llama
def resolve_model_path(env_var, default):
    """Return the path held in environment variable *env_var*, else *default*.

    An unset or empty variable falls back to *default*, so the script behaves
    exactly as before unless an override is supplied explicitly.
    """
    return os.environ.get(env_var) or default


# Model locations. Each default is the original hard-coded path; set the
# matching CODETTE_* environment variable to run against files stored elsewhere.
BASE_GGUF = resolve_model_path(
    "CODETTE_BASE_GGUF",
    r"J:\codette-training-lab\bartowski\Meta-Llama-3.1-8B-Instruct-GGUF\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
)
NEWTON_LORA = resolve_model_path(
    "CODETTE_NEWTON_LORA",
    r"J:\codette-training-lab\adapters\newton-lora-f16.gguf",
)
DAVINCI_LORA = resolve_model_path(
    "CODETTE_DAVINCI_LORA",
    r"J:\codette-training-lab\adapters\davinci-lora-f16.gguf",
)
# Every prompt shares one system instruction; only the question and its tag vary.
_SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely in 2-3 sentences."
_QUESTIONS = (
    ("physics", "Explain why objects fall to the ground."),
    ("philosophy", "What is the relationship between consciousness and the physical world?"),
    ("systems", "How would you design a system that learns from its own mistakes?"),
)
# One dict per prompt with "system", "user" and "tag" keys.
TEST_PROMPTS = [
    {"system": _SYSTEM_PROMPT, "user": question, "tag": tag}
    for tag, question in _QUESTIONS
]
# Keyword arguments unpacked into each chat-completion call.
GEN_KWARGS = {
    "max_tokens": 200,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["<|eot_id|>", "<|end_of_text|>"],
}
def run_test(model_label, llm, prompts, gen_kwargs=None):
    """Run every prompt against a loaded model and collect the replies.

    Args:
        model_label: Heading printed above this model's results.
        llm: Object exposing ``create_chat_completion(messages=..., **kwargs)``
            that returns a dict with ``choices`` and ``usage`` entries.
        prompts: Iterable of dicts with ``"system"``, ``"user"`` and ``"tag"``
            keys.
        gen_kwargs: Optional mapping of keyword arguments forwarded to
            ``create_chat_completion``. Defaults to the module-level
            ``GEN_KWARGS``.

    Returns:
        A list with one dict per prompt, holding ``tag``, ``response``
        (stripped text), ``tokens`` (completion token count) and ``time``
        (seconds spent in the completion call).
    """
    if gen_kwargs is None:
        gen_kwargs = GEN_KWARGS

    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {model_label}")
    print(f"{rule}")

    responses = []
    for p in prompts:
        print(f"\n [{p['tag']}] {p['user']}")
        # perf_counter is monotonic, so a system clock adjustment during a
        # long generation cannot skew or negate the measured duration.
        start = time.perf_counter()
        result = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": p["system"]},
                {"role": "user", "content": p["user"]},
            ],
            **gen_kwargs,
        )
        elapsed = time.perf_counter() - start

        # Treat a missing (None) content as an empty reply rather than
        # crashing on None.strip().
        text = (result["choices"][0]["message"]["content"] or "").strip()
        tokens = result["usage"]["completion_tokens"]
        tps = tokens / elapsed if elapsed > 0 else 0

        print(f" Response ({elapsed:.1f}s, {tokens} tok, {tps:.1f} tok/s):")
        print(f" > {text}")
        responses.append({"tag": p["tag"], "response": text, "tokens": tokens, "time": elapsed})
    return responses
def _load_and_run(label, loading_message, lora_path=None):
    """Load the base model (optionally with one LoRA), run all prompts, release it."""
    print(loading_message)
    load_kwargs = dict(
        model_path=BASE_GGUF,
        n_ctx=2048,
        n_gpu_layers=0,  # CPU only to save VRAM
        verbose=False,
    )
    if lora_path is not None:
        load_kwargs["lora_path"] = lora_path

    start = time.perf_counter()
    llm = Llama(**load_kwargs)
    print(f" Loaded in {time.perf_counter()-start:.1f}s")
    try:
        return run_test(label, llm, TEST_PROMPTS)
    finally:
        # Release the model before the next one is loaded. Use an explicit
        # close() when the object offers one (NOTE(review): confirm the
        # installed llama-cpp-python version provides it); otherwise the
        # reference simply goes out of scope when this helper returns.
        close = getattr(llm, "close", None)
        if callable(close):
            close()


def main():
    """Run the prompts on the base model and on each LoRA adapter, then compare.

    Each configuration is loaded, queried and released in turn so only one
    model is held at a time. A summary then prints the first 120 characters
    of every response, grouped by prompt tag.
    """
    rule = "=" * 60
    print(rule)
    print("Codette LoRA Adapter Inference Test")
    print(rule)
    print(f"Base model: {os.path.basename(BASE_GGUF)}")
    print(f"Newton LoRA: {os.path.basename(NEWTON_LORA)}")
    print(f"DaVinci LoRA: {os.path.basename(DAVINCI_LORA)}")

    # (result key, printed heading, loading message, LoRA path or None)
    runs = (
        ("base", "BASE MODEL (no adapter)", "\nLoading BASE model (no adapter)...", None),
        ("newton", "NEWTON ADAPTER", "\n\nLoading BASE + NEWTON adapter...", NEWTON_LORA),
        ("davinci", "DAVINCI ADAPTER", "\n\nLoading BASE + DAVINCI adapter...", DAVINCI_LORA),
    )
    all_results = {}
    for key, label, loading_message, lora_path in runs:
        all_results[key] = _load_and_run(label, loading_message, lora_path)

    # --- Summary ---
    print(f"\n{rule}")
    print("COMPARISON SUMMARY")
    print(f"{rule}")
    # Take the tags from the prompts themselves (first-seen order, no
    # duplicates) so the summary cannot drift out of sync with TEST_PROMPTS.
    tags = list(dict.fromkeys(p["tag"] for p in TEST_PROMPTS))
    for tag in tags:
        print(f"\n--- {tag.upper()} ---")
        for model_name, responses in all_results.items():
            for r in responses:
                if r["tag"] == tag:
                    text = r["response"]
                    short = text[:120] + "..." if len(text) > 120 else text
                    print(f" {model_name:8s}: {short}")

    print(f"\n{rule}")
    print("TEST COMPLETE")
    print(f"{rule}")
# Run the comparison only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()