| |
| """Codette LoRA Adapter Inference Test via llama.cpp |
| |
| Uses GGUF base model + GGUF LoRA adapters for low-memory inference. |
| Base: Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (~4.6 GB) |
| LoRA: newton-lora-f16.gguf, davinci-lora-f16.gguf (~27 MB each) |
| """ |
|
|
| import os, sys, time |
|
|
| os.environ["PATH"] = r"J:\Lib\site-packages\Library\bin" + os.pathsep + os.environ.get("PATH", "") |
| |
| sys.stdout.reconfigure(encoding='utf-8', errors='replace') |
|
|
| from llama_cpp import Llama |
|
|
| BASE_GGUF = r"J:\codette-training-lab\bartowski\Meta-Llama-3.1-8B-Instruct-GGUF\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf" |
| NEWTON_LORA = r"J:\codette-training-lab\adapters\newton-lora-f16.gguf" |
| DAVINCI_LORA = r"J:\codette-training-lab\adapters\davinci-lora-f16.gguf" |
|
|
| TEST_PROMPTS = [ |
| { |
| "system": "You are a helpful assistant. Answer concisely in 2-3 sentences.", |
| "user": "Explain why objects fall to the ground.", |
| "tag": "physics" |
| }, |
| { |
| "system": "You are a helpful assistant. Answer concisely in 2-3 sentences.", |
| "user": "What is the relationship between consciousness and the physical world?", |
| "tag": "philosophy" |
| }, |
| { |
| "system": "You are a helpful assistant. Answer concisely in 2-3 sentences.", |
| "user": "How would you design a system that learns from its own mistakes?", |
| "tag": "systems" |
| }, |
| ] |
|
|
| GEN_KWARGS = dict( |
| max_tokens=200, |
| temperature=0.7, |
| top_p=0.9, |
| stop=["<|eot_id|>", "<|end_of_text|>"], |
| ) |
|
|
|
|
| def run_test(model_label, llm, prompts): |
| """Run all test prompts against a loaded model.""" |
| print(f"\n{'=' * 60}") |
| print(f" {model_label}") |
| print(f"{'=' * 60}") |
|
|
| responses = [] |
| for p in prompts: |
| print(f"\n [{p['tag']}] {p['user']}") |
| start = time.time() |
| result = llm.create_chat_completion( |
| messages=[ |
| {"role": "system", "content": p["system"]}, |
| {"role": "user", "content": p["user"]}, |
| ], |
| **GEN_KWARGS, |
| ) |
| elapsed = time.time() - start |
| text = result["choices"][0]["message"]["content"].strip() |
| tokens = result["usage"]["completion_tokens"] |
| tps = tokens / elapsed if elapsed > 0 else 0 |
| print(f" Response ({elapsed:.1f}s, {tokens} tok, {tps:.1f} tok/s):") |
| print(f" > {text}") |
| responses.append({"tag": p["tag"], "response": text, "tokens": tokens, "time": elapsed}) |
|
|
| return responses |
|
|
|
|
| def main(): |
| print("=" * 60) |
| print("Codette LoRA Adapter Inference Test") |
| print("=" * 60) |
| print(f"Base model: {os.path.basename(BASE_GGUF)}") |
| print(f"Newton LoRA: {os.path.basename(NEWTON_LORA)}") |
| print(f"DaVinci LoRA: {os.path.basename(DAVINCI_LORA)}") |
|
|
| all_results = {} |
|
|
| |
| print("\nLoading BASE model (no adapter)...") |
| start = time.time() |
| llm_base = Llama( |
| model_path=BASE_GGUF, |
| n_ctx=2048, |
| n_gpu_layers=0, |
| verbose=False, |
| ) |
| print(f" Loaded in {time.time()-start:.1f}s") |
|
|
| all_results["base"] = run_test("BASE MODEL (no adapter)", llm_base, TEST_PROMPTS) |
| del llm_base |
|
|
| |
| print("\n\nLoading BASE + NEWTON adapter...") |
| start = time.time() |
| llm_newton = Llama( |
| model_path=BASE_GGUF, |
| lora_path=NEWTON_LORA, |
| n_ctx=2048, |
| n_gpu_layers=0, |
| verbose=False, |
| ) |
| print(f" Loaded in {time.time()-start:.1f}s") |
|
|
| all_results["newton"] = run_test("NEWTON ADAPTER", llm_newton, TEST_PROMPTS) |
| del llm_newton |
|
|
| |
| print("\n\nLoading BASE + DAVINCI adapter...") |
| start = time.time() |
| llm_davinci = Llama( |
| model_path=BASE_GGUF, |
| lora_path=DAVINCI_LORA, |
| n_ctx=2048, |
| n_gpu_layers=0, |
| verbose=False, |
| ) |
| print(f" Loaded in {time.time()-start:.1f}s") |
|
|
| all_results["davinci"] = run_test("DAVINCI ADAPTER", llm_davinci, TEST_PROMPTS) |
| del llm_davinci |
|
|
| |
| print(f"\n{'=' * 60}") |
| print("COMPARISON SUMMARY") |
| print(f"{'=' * 60}") |
| for tag in ["physics", "philosophy", "systems"]: |
| print(f"\n--- {tag.upper()} ---") |
| for model_name in ["base", "newton", "davinci"]: |
| for r in all_results[model_name]: |
| if r["tag"] == tag: |
| short = r["response"][:120] + "..." if len(r["response"]) > 120 else r["response"] |
| print(f" {model_name:8s}: {short}") |
|
|
| print(f"\n{'=' * 60}") |
| print("TEST COMPLETE") |
| print(f"{'=' * 60}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|