"""
finetune_local.py - Local adaptation of Soci_FineTune_3_Incremental
Fine-tunes Qwen2.5 / Llama-3.1 base models (default: Qwen2.5-0.5B-Instruct) on Soci
city-simulation tasks using Unsloth.

Differences from the Colab version:
  - No Google Drive / google.colab dependencies
  - Local checkpoint and adapter storage in data/training/
  - Loads live conversation data from data/training/processed/
  - HF token from HF_TOKEN env var (or .env file)
  - --debug flag for quick 1-epoch smoke test (no HF push)
  - --resume flag to continue from saved LoRA adapters

Usage (from project root):
    # Debug / smoke test (fast, no push):
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --debug

    # Full round-1 training on default 0.5b model + push to HF:
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py

    # Fine-tune specific model sizes:
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 7b
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 8b

    # Resume round 2 for a specific model:
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/finetune_local.py --base-model 7b --resume

Model profiles (base model -> HF repo):
    0.5b -> RayMelius/soci-agent-q4   (Qwen2.5-0.5B, batch=2, seq=2048)
    1.5b -> RayMelius/soci-agent-1b5  (Qwen2.5-1.5B, batch=2, seq=2048)
    3b   -> RayMelius/soci-agent-3b   (Qwen2.5-3B,   batch=2, seq=2048)
    7b   -> RayMelius/soci-agent-7b   (Qwen2.5-7B,   batch=1, seq=512)
    8b   -> RayMelius/soci-agent-8b   (Llama-3.1-8B, batch=1, seq=512)
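
Other flags:
    --no-push     skip the HuggingFace Hub upload
    --no-gguf     skip the GGUF export
    --epochs N    override the per-round epoch count
    --hf-repo ID  push to this HF repo instead of the profile default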
"""

from __future__ import annotations

import sys
import io
import os

# Force UTF-8 stdout/stderr on Windows (unsloth prints emoji characters)
if sys.platform == "win32":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Disable torch.compile/inductor - triton 3.x on Windows doesn't export 'triton_key'
# which inductor needs at compile time.  Training still uses CUDA kernels, just not
# the AOT-compiled fusion path.  Has no meaningful effect on a single-GPU setup.
os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1")
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")

# Import unsloth FIRST so it can patch transformers before anything else loads.
# Then patch list_repo_templates to skip the 'additional_chat_templates' HF Hub
# check that fails on unsloth's quantized repos (transformers 4.56+ behavior).
import unsloth  # noqa: F401 - must be first
import transformers.utils.hub
import transformers.tokenization_utils_base
_noop = lambda *a, **kw: []
transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop

import argparse
import json
from datetime import datetime
from pathlib import Path

# ── Parse args first (before heavy imports) ───────────────────────────────────
parser = argparse.ArgumentParser(description="Soci local fine-tune")
parser.add_argument("--resume",     action="store_true", help="Resume from saved LoRA adapters")
parser.add_argument("--debug",      action="store_true", help="Debug/smoke-test: 1 epoch, 20 examples, no push")
parser.add_argument("--no-push",    action="store_true", help="Skip HF Hub push")
parser.add_argument("--no-gguf",    action="store_true", help="Skip GGUF export")
parser.add_argument("--epochs",     type=int, default=None, help="Override epoch count")
parser.add_argument("--hf-repo",    default=None, help="HF repo ID (overrides default)")
parser.add_argument("--base-model", default="0.5b",
                    choices=["0.5b", "1.5b", "3b", "7b", "8b"],
                    help="Base model size to fine-tune (default: 0.5b)")
args = parser.parse_args()

# ── Model profiles (base model -> unsloth ID, HF repo, VRAM settings) ──────────
_MODEL_PROFILES = {
    "0.5b": dict(
        model_id      = "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
        repo_name     = "soci-agent-q4",
        seq_len       = 2048,
        batch         = 2,
        grad_accum    = 4,
        lora_r        = 16,
        lora_targets  = ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
    ),
    "1.5b": dict(
        model_id      = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
        repo_name     = "soci-agent-1b5",
        seq_len       = 2048,
        batch         = 2,
        grad_accum    = 4,
        lora_r        = 16,
        lora_targets  = ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
    ),
    "3b": dict(
        model_id      = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
        repo_name     = "soci-agent-3b",
        seq_len       = 2048,
        batch         = 2,
        grad_accum    = 4,
        lora_r        = 16,
        lora_targets  = ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
    ),
    # 7B and 8B: minimal LoRA to stay within 6.4 GB VRAM on RTX 4050 Laptop.
    # 7B in 4-bit uses ~3.8GB; only ~2.6GB left for activations + optimizer.
    # r=8, q+v only -> ~5M trainable params, small optimizer footprint.
    "7b": dict(
        model_id      = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        repo_name     = "soci-agent-7b",
        seq_len       = 512,
        batch         = 1,
        grad_accum    = 8,
        lora_r        = 8,
        lora_targets  = ["q_proj", "v_proj"],
    ),
    "8b": dict(
        model_id      = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        repo_name     = "soci-agent-8b",
        seq_len       = 512,
        batch         = 1,
        grad_accum    = 8,
        lora_r        = 8,
        lora_targets  = ["q_proj", "v_proj"],
    ),
}
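# To add another size, append a profile here with its unsloth 4-bit model ID and a
# seq_len / batch / lora_r combination that fits your VRAM, then add the new key to
# the --base-model choices in the argparse section above.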
_PROFILE = _MODEL_PROFILES[args.base_model]

# ── Paths (per-model subdirs so runs don't clobber each other) ─────────────────
TRAIN_DIR        = Path("data/training")
MODEL_DIR        = TRAIN_DIR / args.base_model          # e.g. data/training/7b/
LORA_SAVE_DIR    = MODEL_DIR / "lora_adapters"
DATA_ARCHIVE_DIR = MODEL_DIR / "data_archive"
GGUF_DIR         = MODEL_DIR / "gguf"
CHECKPOINTS_DIR  = MODEL_DIR / "checkpoints"
ROUND_FILE       = MODEL_DIR / "training_round.json"
CORE_DATA_FILE   = TRAIN_DIR / "core_examples.json"
LIVE_DATA_FILE   = TRAIN_DIR / "processed" / "soci_training.jsonl"

for d in [LORA_SAVE_DIR, DATA_ARCHIVE_DIR, GGUF_DIR, CHECKPOINTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ── Config ────────────────────────────────────────────────────────────────────
MAX_SEQ_LENGTH = _PROFILE["seq_len"]
HF_USERNAME    = "RayMelius"
HF_REPO_ID     = args.hf_repo or f"{HF_USERNAME}/{_PROFILE['repo_name']}"

# Load HF token
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    # Try to read from the project .env
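    # Expected entry format (value shown is a placeholder): HF_TOKEN=hf_xxxxxxxxxxxxxxxx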
    env_file = Path(".env")
    if env_file.exists():
        for line in env_file.read_text().splitlines():
            if line.startswith("HF_TOKEN="):
                HF_TOKEN = line.split("=", 1)[1].strip().strip('"')

# ── GPU check ─────────────────────────────────────────────────────────────────
import torch
if not torch.cuda.is_available():
    print("[WARN] No CUDA GPU detected β€” training will be very slow on CPU.")
    print("       Consider running on Colab or a machine with a GPU.")
else:
    print(f"GPU : {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# ── Patch unsloth fused CE loss for low-VRAM GPUs ─────────────────────────────
# unsloth_zoo._get_chunk_multiplier checks free VRAM *after* model load.
# On 6.4 GB GPUs the 7B model consumes almost all VRAM, leaving ~0 free,
# which causes it to raise "No or negligible GPU memory available".
# Replace it with a version that falls back to a 100 MB budget instead of raising.
import functools
import unsloth_zoo.fused_losses.cross_entropy_loss as _unsloth_ce

@functools.cache
def _safe_chunk_multiplier(vocab_size, target_gb=None):
    if target_gb is None:
        try:
            free, _ = torch.cuda.mem_get_info(0)
            free_gb = free / (1024 ** 3) * 0.5
        except Exception:
            free_gb = 0.0
        target_gb = max(free_gb, 0.1)   # always at least 100 MB budget
    if target_gb <= 1e-9:
        target_gb = 0.1
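    # Budget = half of the currently-free VRAM (never below 100 MB). vocab_size * 4 bytes
    # is roughly one token's fp32 logits row, so the ratio below decides how finely the
    # fused cross-entropy loss gets chunked.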
    multiplier = (vocab_size * 4 / (1024 ** 3)) / target_gb
    multiplier = multiplier / 4
    return multiplier

_unsloth_ce._get_chunk_multiplier = _safe_chunk_multiplier
print("Patched unsloth fused CE loss for low-VRAM GPU")

# ── Determine training round ──────────────────────────────────────────────────
RESUME = args.resume
if RESUME and ROUND_FILE.exists():
    round_info = json.loads(ROUND_FILE.read_text())
    CURRENT_ROUND = round_info["round"] + 1
    print(f"Resuming from round {round_info['round']} -> round {CURRENT_ROUND}")
    print(f"Previous loss: {round_info.get('final_loss', 'N/A')}")
elif RESUME:
    CURRENT_ROUND = 2
    print("No round file found, assuming round 2")
else:
    CURRENT_ROUND = 1
    print("Starting fresh (round 1)")

# ── Load model ────────────────────────────────────────────────────────────────
from unsloth import FastLanguageModel  # noqa: already imported via 'import unsloth'

if RESUME and LORA_SAVE_DIR.exists() and any(LORA_SAVE_DIR.iterdir()):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = str(LORA_SAVE_DIR),
        max_seq_length = MAX_SEQ_LENGTH,
        dtype          = None,
        load_in_4bit   = True,
    )
    print(f"Resumed LoRA adapters from {LORA_SAVE_DIR}")
else:
    if RESUME:
        print(f"[WARN] No LoRA adapters at {LORA_SAVE_DIR}, starting fresh.")
        CURRENT_ROUND = 1
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = _PROFILE["model_id"],
        max_seq_length = MAX_SEQ_LENGTH,
        dtype          = None,
        load_in_4bit   = True,
    )
    print(f"Fresh base model loaded (round 1): {_PROFILE['model_id']}")

# ── Attach LoRA ───────────────────────────────────────────────────────────────
if CURRENT_ROUND == 1:
    model = FastLanguageModel.get_peft_model(
        model,
        r                          = _PROFILE["lora_r"],
        target_modules             = _PROFILE["lora_targets"],
        lora_alpha                 = _PROFILE["lora_r"],   # lora_alpha == r is standard
        lora_dropout               = 0,
        bias                       = "none",
        use_gradient_checkpointing = "unsloth",
        random_state               = 42,
    )
    print("Fresh LoRA adapters attached")
else:
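    # Resuming: the adapter directory loaded above already wraps the base model with
    # PEFT, so get_peft_model is not called again; just re-enable checkpointing.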
    model.gradient_checkpointing_enable()
    print(f"Resumed LoRA adapters from round {CURRENT_ROUND - 1}")

model.print_trainable_parameters()

# ── System prompt ─────────────────────────────────────────────────────────────
SYSTEM_PROMPT = (
    "You are the reasoning engine for Soci, an LLM-powered city population simulator. "
    "You control AI agents (NPCs) living in a city. Each agent has a persona, needs "
    "(hunger, energy, social, purpose, comfort, fun), memories, and relationships. "
    "You receive structured context and must respond ONLY with valid JSON. "
    "Never add explanation outside the JSON."
)

# ── Load training data ────────────────────────────────────────────────────────
print("\nLoading training data...")

# 1. Core examples (from data/training/core_examples.json, extracted from v3 script)
core_examples: list[dict] = []
if CORE_DATA_FILE.exists():
    core_examples = json.loads(CORE_DATA_FILE.read_text(encoding="utf-8"))
    print(f"  Core examples: {len(core_examples)}")
else:
    print(f"  [WARN] {CORE_DATA_FILE} not found - run extract step or collect_training_data.py first")

# 2. Live collected data from the running simulation
live_examples: list[dict] = []
if LIVE_DATA_FILE.exists():
    with open(LIVE_DATA_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ex = json.loads(line)
                # Convert messages format -> instruction/response format
                msgs = ex.get("messages", [])
                if len(msgs) >= 3:
                    # Collected format: [system(persona), user(context), assistant(reply)].
                    # Training uses one unified SYSTEM_PROMPT, so fold the original persona
                    # system message into the instruction instead of dropping it.
                    persona_ctx  = msgs[0]["content"]
                    user_content = msgs[1]["content"]
                    asst_content = msgs[2]["content"]
                    instruction = f"{persona_ctx}\n\n{user_content}"
                    live_examples.append({
                        "instruction": instruction,
                        "response": asst_content,
                    })
            except (json.JSONDecodeError, KeyError):
                pass
    print(f"  Live examples: {len(live_examples)} (from Render simulation)")

# 3. Replay archived examples from previous rounds
replay_examples: list[dict] = []
if CURRENT_ROUND > 1:
    for archive_f in sorted(DATA_ARCHIVE_DIR.glob("round_*.json")):
        try:
            batch = json.loads(archive_f.read_text(encoding="utf-8"))
            replay_examples.extend(batch)
        except Exception:
            pass
    print(f"  Replay examples: {len(replay_examples)}")

# 4. New examples for this round (add yours here for incremental training)
new_examples_this_round: list[dict] = [
    # Add new instruction/response pairs here for incremental training rounds.
    # Example:
    # {"instruction": "You are playing Diana Novak, 41, grocery store owner. ...",
    #  "response": '{"action": "work", "location": "grocery_store", "reason": "..."}'},
]
if new_examples_this_round:
    print(f"  New examples this round: {len(new_examples_this_round)}")

# Merge and deduplicate on the first 100 characters of each instruction, so
# near-identical prompts (e.g. replayed archives) collapse to a single example
seen: set[str] = set()
all_examples: list[dict] = []
for ex in core_examples + live_examples + new_examples_this_round + replay_examples:
    key = ex.get("instruction", "")[:100]
    if key not in seen:
        seen.add(key)
        all_examples.append(ex)

if args.debug:
    all_examples = all_examples[:20]
    print(f"  DEBUG mode: using {len(all_examples)} examples")

print(f"  Total (deduped): {len(all_examples)}")

# ── Format into chat template ─────────────────────────────────────────────────
from datasets import Dataset

def format_example(ex: dict) -> dict:
    msgs = [
        {"role": "system",    "content": SYSTEM_PROMPT},
        {"role": "user",      "content": ex["instruction"]},
        {"role": "assistant", "content": ex["response"]},
    ]
    return {"text": tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=False
    )}
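
# For the Qwen profiles the chat template renders ChatML-style blocks
# (<|im_start|>role ... <|im_end|>); the Llama-3.1 8B profile uses its own header
# tokens. The sample printed below shows the exact rendering for the loaded tokenizer.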

dataset = Dataset.from_list(all_examples).map(format_example)
print(f"Formatted {len(dataset)} examples. Sample:")
print(dataset[0]["text"][:400])

# ── Training config ───────────────────────────────────────────────────────────
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported

if args.debug:
    LR, EPOCHS, WARMUP, SCHEDULER = 2e-4, 1, 2, "linear"
    print("\nDEBUG: 1 epoch smoke test")
elif CURRENT_ROUND == 1:
    LR, EPOCHS, WARMUP, SCHEDULER = 2e-4, 3, 5, "linear"
    print(f"\nRound 1: Full training - LR={LR}, epochs={EPOCHS}")
else:
    LR, EPOCHS, WARMUP, SCHEDULER = 5e-5, 2, 10, "cosine"
    print(f"\nRound {CURRENT_ROUND}: Incremental - LR={LR}, epochs={EPOCHS}")

if args.epochs is not None:
    EPOCHS = args.epochs
    print(f"Epoch override: {EPOCHS}")
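
# Effective batch size (per_device_batch * grad_accum) is 8 for every profile, so
# optimizer steps per epoch stay comparable across model sizes on the same dataset.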

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",
    max_seq_length     = MAX_SEQ_LENGTH,
    dataset_num_proc   = 2,
    args = SFTConfig(
        per_device_train_batch_size = _PROFILE["batch"],
        gradient_accumulation_steps = _PROFILE["grad_accum"],
        warmup_steps                = WARMUP,
        num_train_epochs            = EPOCHS,
        learning_rate               = LR,
        fp16                        = not is_bfloat16_supported(),
        bf16                        = is_bfloat16_supported(),
        logging_steps               = 5,
        optim                       = "adamw_8bit",
        weight_decay                = 0.01,
        lr_scheduler_type           = SCHEDULER,
        seed                        = 42,
        output_dir                  = str(CHECKPOINTS_DIR),
        report_to                   = "none",
        dataset_text_field          = "text",
        max_seq_length              = MAX_SEQ_LENGTH,
    ),
)

print(f"\nTraining round {CURRENT_ROUND} on {len(dataset)} examples...")
torch.cuda.empty_cache()   # free any cached fragments before training starts
stats = trainer.train()
print(f"\nRound {CURRENT_ROUND} complete!")
print(f"   Steps: {stats.global_step}  |  Final loss: {stats.training_loss:.4f}")

# ── Save LoRA adapters ────────────────────────────────────────────────────────
print(f"\nSaving LoRA adapters to {LORA_SAVE_DIR}...")
model.save_pretrained(str(LORA_SAVE_DIR))
tokenizer.save_pretrained(str(LORA_SAVE_DIR))
print("  Saved.")

# ── Save round metadata ───────────────────────────────────────────────────────
round_info = {
    "round":          CURRENT_ROUND,
    "final_loss":     stats.training_loss,
    "global_steps":   stats.global_step,
    "total_examples": len(all_examples),
    "new_examples":   len(new_examples_this_round) + len(live_examples),
    "learning_rate":  LR,
    "epochs":         EPOCHS,
    "timestamp":      datetime.now().isoformat(),
}
ROUND_FILE.write_text(json.dumps(round_info, indent=2))
print(f"  Round info: {ROUND_FILE}")

# Archive new examples
all_new = new_examples_this_round + live_examples
if all_new:
    archive_file = DATA_ARCHIVE_DIR / f"round_{CURRENT_ROUND:03d}.json"
    archive_file.write_text(json.dumps(all_new, indent=2, ensure_ascii=False))
    print(f"  Archived {len(all_new)} new examples")

# Training history
history_file = TRAIN_DIR / "training_history.jsonl"
with open(history_file, "a", encoding="utf-8") as f:
    f.write(json.dumps(round_info) + "\n")

# ── Quick inference test ──────────────────────────────────────────────────────
print(f"\n=== Testing after Round {CURRENT_ROUND} ===\n")
FastLanguageModel.for_inference(model)

def ask(question: str, label: str = "") -> None:
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user",   "content": question},
    ]
    encoded = tokenizer.apply_chat_template(
        msgs, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    # Send inputs to whatever device the model lives on (robust if CUDA is unavailable)
    device = next(model.parameters()).device
    if hasattr(encoded, "input_ids"):
        inp = encoded.input_ids.to(device)
    else:
        inp = encoded.to(device)
    out = model.generate(
        input_ids=inp, max_new_tokens=200,
        temperature=0.7, top_p=0.9, do_sample=True,
    )
    resp = tokenizer.decode(out[0][inp.shape[1]:], skip_special_tokens=True)
    print(f"[{label}]")
    print(f"Q: {question[:100]}...")
    try:
        parsed = json.loads(resp)
        print(f"A (valid JSON):\n{json.dumps(parsed, indent=2)}")
    except Exception:
        print(f"A (raw): {resp}")
    print("-" * 60)

ask(
    "You are playing Elena Vasquez, 34, software engineer. "
    "Needs: energy=0.3, hunger=0.7. Location: office. Time: 12:30. "
    "Decide next action. JSON: {\"action\": str, \"location\": str, \"reason\": str}",
    "decide_action",
)
ask(
    "You are playing Marcus Chen talking to Zoe. "
    "Zoe says: 'Marcus, I bombed my exam.' Continue as Marcus. "
    "JSON: {\"speech\": str, \"emotion\": str}",
    "conversation_turn",
)

# ── GGUF export ───────────────────────────────────────────────────────────────
# Windows: unsloth GGUF export requires building llama.cpp via apt-get (Linux only).
# Auto-skip on Windows; use --no-gguf on Linux too if llama.cpp isn't set up.
import platform
_on_windows = platform.system() == "Windows"
skip_gguf = args.no_gguf or args.debug or _on_windows
if _on_windows and not args.no_gguf and not args.debug:
    print("\nSkipping GGUF export (Windows - llama.cpp build not supported via unsloth on Win)")
    print("  To export GGUF manually, use llama.cpp's convert_hf_to_gguf.py")
    print(f"  Target directory for manual GGUF exports: {GGUF_DIR}/")
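    # Manual export sketch (assumes a local llama.cpp checkout; names are illustrative):
    #   model.save_pretrained_merged("merged_dir", tokenizer)
    #   python llama.cpp/convert_hf_to_gguf.py merged_dir --outfile soci-agent.gguf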

if not skip_gguf:
    print("\nExporting GGUF Q4_K_M (takes a few minutes)...")
    model.save_pretrained_gguf(str(GGUF_DIR), tokenizer, quantization_method="q4_k_m")
    gguf_files = list(GGUF_DIR.glob("*.gguf"))
    for gf in gguf_files:
        print(f"  GGUF: {gf.name}  ({gf.stat().st_size / 1e6:.0f} MB)")
else:
    if args.debug:
        print("\nSkipping GGUF export (debug mode)")
    gguf_files = []

# ── Push to HuggingFace Hub ───────────────────────────────────────────────────
skip_push = args.no_push or args.debug
if skip_push:
    print("\nSkipping HF push (debug mode or --no-push)")
else:
    if not HF_TOKEN:
        print("\n[WARN] No HF_TOKEN found - skipping push.")
        print("  Set HF_TOKEN env var or add to .env file.")
    else:
        from huggingface_hub import login, HfApi
        print(f"\nPushing to HuggingFace: {HF_REPO_ID}")
        login(token=HF_TOKEN)
        api = HfApi()
        api.create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True)

        # Push LoRA adapters
        print("  Uploading LoRA adapters...")
        api.upload_folder(
            folder_path = str(LORA_SAVE_DIR),
            repo_id     = HF_REPO_ID,
            repo_type   = "model",
            path_in_repo= "lora_adapters",
        )
        print(f"  LoRA -> https://huggingface.co/{HF_REPO_ID}/tree/main/lora_adapters")

        # Push GGUF file(s)
        for gf in gguf_files:
            mb = gf.stat().st_size / 1e6
            print(f"  Uploading {gf.name} ({mb:.0f} MB)...")
            api.upload_file(
                path_or_fileobj = str(gf),
                path_in_repo    = gf.name,
                repo_id         = HF_REPO_ID,
                repo_type       = "model",
            )
            print(f"  Done: https://huggingface.co/{HF_REPO_ID}/blob/main/{gf.name}")

        # Push round metadata
        api.upload_file(
            path_or_fileobj = str(ROUND_FILE),
            path_in_repo    = "training_round.json",
            repo_id         = HF_REPO_ID,
            repo_type       = "model",
        )

        print(f"\nUpload complete! Model at: https://huggingface.co/{HF_REPO_ID}")

# ── Training history display ──────────────────────────────────────────────────
print("\n=== Training History ===\n")
if history_file.exists():
    print(f"{'Round':>6} {'Loss':>8} {'Steps':>7} {'Examples':>9} {'New':>5} {'LR':>10} {'Date':>12}")
    print("-" * 65)
    with open(history_file, encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            date = r.get("timestamp", "")[:10]
            print(f"{r['round']:>6} {r['final_loss']:>8.4f} {r['global_steps']:>7} "
                  f"{r['total_examples']:>9} {r['new_examples']:>5} "
                  f"{r['learning_rate']:>10.1e} {date:>12}")

print("\nTo resume: python scripts/finetune_local.py --resume")
print(f"LoRA adapters: {LORA_SAVE_DIR}")
if gguf_files:
    print(f"GGUF: {gguf_files[0]}")
print("\nOllama integration:")
print("  ollama create soci-agent -f Modelfile")
print("  set SOCI_PROVIDER=ollama && set OLLAMA_MODEL=soci-agent")
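
# The Modelfile referenced above is not generated by this script. A minimal sketch
# (path and filename are examples; point FROM at your exported GGUF):
#   FROM ./data/training/0.5b/gguf/<your-model>.Q4_K_M.gguf
#   SYSTEM "You are the reasoning engine for Soci, an LLM-powered city population simulator."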