Spaces:
Sleeping
Sleeping
File size: 8,577 Bytes
1104031 6895900 2bf5583 6895900 2bf5583 1104031 6895900 1104031 45e223f 6895900 1104031 6895900 45e223f 6895900 2bf5583 1104031 e98676b 6895900 2bf5583 6895900 2bf5583 6895900 2bf5583 6895900 2bf5583 6895900 2bf5583 6895900 45e223f 1104031 6895900 1104031 c77ca5f 6895900 45e223f c77ca5f 1104031 45e223f c77ca5f 1104031 6895900 1104031 2bf5583 1104031 2bf5583 1104031 45e223f 6895900 1104031 6895900 45e223f 1104031 6895900 1104031 6895900 45e223f 6895900 1104031 45e223f 1104031 6895900 1104031 45e223f 1104031 45e223f 1104031 6895900 45e223f 6895900 45e223f 6895900 45e223f 1104031 6895900 1104031 6895900 1104031 6895900 1104031 6895900 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | """Interactive eval: run test samples through the local GGUF model.
Requires llama-server running on port 8080:
llama-server -m finetune/models/<model>.gguf -ngl 99 --port 8080 --ctx-size 4096 --log-disable
Uses the /v1/chat/completions endpoint with a messages list. The Qwen3 GGUF
embeds its chat template in metadata, so llama-server applies it automatically.
Usage
-----
uv run finetune/eval_cli.py # prompts for index
uv run finetune/eval_cli.py 5 # run sample at index 5
uv run finetune/eval_cli.py 5 12 20 # run multiple samples
Use --task places for place extraction:
uv run finetune/eval_cli.py --task places 0 5
Override run directory:
uv run finetune/eval_cli.py --run-dir dataset/output/runs/v1 0
"""
from __future__ import annotations
import argparse
import json
import sys
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
SERVER_URL = "http://localhost:9000"
MAX_TOKENS = 2048
TEMPERATURE = 0.6
DEFAULT_RUN_DIR = Path("dataset/output/runs/v3")
def postprocess_sql(text: str) -> str:
cleaned = text.strip()
if "```sql" in cleaned:
cleaned = cleaned.split("```sql", 1)[1]
if cleaned.startswith("```"):
cleaned = cleaned[3:]
if "```" in cleaned:
cleaned = cleaned.split("```", 1)[0]
return cleaned.strip()
def check_server() -> bool:
try:
urllib.request.urlopen(f"{SERVER_URL}/health", timeout=2)
return True
except Exception:
return False
def chat_complete(messages: list[dict]) -> str:
"""Call llama-server /v1/chat/completions with a messages list."""
payload = json.dumps({
"messages": messages,
"n_predict": MAX_TOKENS,
"temperature": TEMPERATURE,
"chat_template_kwargs": {"enable_thinking": False},
}).encode()
req = urllib.request.Request(
f"{SERVER_URL}/v1/chat/completions",
data=payload,
headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=60) as resp:
return json.loads(resp.read())["choices"][0]["message"]["content"]
def load_samples(run_dir: Path, task: str, split: str = "val") -> list[dict]:
path = run_dir / task / f"{split}.jsonl"
if not path.exists():
print(f"Error: {path} not found")
sys.exit(1)
print(f"Loading {task} samples from: {path}")
with path.open() as f:
return [json.loads(line) for line in f if line.strip()]
def build_raw_prompt(sample: dict) -> str:
"""Reconstruct the plain prompt string from messages format (all turns except assistant)."""
return "\n\n".join(m["content"] for m in sample["messages"][:-1])
def eval_sample(sample: dict, task: str) -> dict:
"""Run a single sample through the server and return a result dict."""
expected = sample["messages"][-1]["content"]
messages = sample["messages"][:-1]
user_content = sample["messages"][-2]["content"]
if "<USER_QUERY>" in user_content:
question = user_content.split("<USER_QUERY>")[-1].split("</USER_QUERY>")[0].strip()
else:
question = user_content[:120]
raw = chat_complete(messages)
predicted = postprocess_sql(raw) if task == "sql" else raw.strip()
return {
"question": question,
"expected": expected,
"predicted": predicted,
"exact_match": predicted.strip() == expected.strip(),
}
def run_sample(sample: dict, task: str, total: int, index: int, verbose: bool = False) -> None:
user_content = sample["messages"][-2]["content"]
if "<USER_QUERY>" in user_content:
question = user_content.split("<USER_QUERY>")[-1].split("</USER_QUERY>")[0].strip()
else:
question = user_content[:120]
header = f" Sample {index}/{total-1} | {task} "
print(f"\n{'β' * 60}")
print(f"{'β' * ((60 - len(header)) // 2)}{header}{'β' * ((60 - len(header)) // 2)}")
print(f"{'β' * 60}")
print(f"\nQuestion: {question}\n")
if verbose:
prompt = build_raw_prompt(sample)
print(f"{'β' * 60}")
print(f"Full prompt ({len(prompt)} chars, ~{len(prompt.split())} words):")
print(f"{'β' * 60}")
print(prompt)
result = eval_sample(sample, task)
print(f"{'β' * 60}")
print("Expected:")
print(f"{'β' * 60}")
print(result["expected"])
print(f"\n{'β' * 60}")
print("Generated:")
print(f"{'β' * 60}")
print(result["predicted"])
print(f"\n{'β' * 60}")
print(f"Match: {'YES' if result['exact_match'] else 'NO'}")
def run_batch(
samples: list[dict],
task: str,
label: str,
output_path: Path,
workers: int = 8,
) -> None:
"""Run all samples concurrently and save results to a JSON file."""
total = len(samples)
results = [None] * total
completed = 0
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {executor.submit(eval_sample, s, task): i for i, s in enumerate(samples)}
for future in as_completed(futures):
i = futures[future]
result = future.result()
results[i] = {"index": i, **result}
completed += 1
if completed % 50 == 0 or completed == total:
print(f"{completed}/{total} done", flush=True)
matches = sum(1 for r in results if r["exact_match"])
exact_match_rate = matches / total if total else 0
output = {
"summary": {
"label": label,
"task": task,
"num_samples": total,
"exact_matches": matches,
"exact_match_rate": exact_match_rate,
"timestamp": datetime.now().isoformat(),
},
"results": results,
}
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as f:
json.dump(output, f, indent=2)
print(f"\n{'=' * 60}")
print(f"[{label}] {matches}/{total} exact matches ({100 * exact_match_rate:.1f}%)")
print(f"Results saved to {output_path}")
print(f"{'=' * 60}")
def main() -> None:
parser = argparse.ArgumentParser(description="Interactive eval against llama-server")
parser.add_argument("indices", nargs="*", type=int, help="Sample indices to evaluate")
parser.add_argument("--task", default="sql", choices=["sql", "places"])
parser.add_argument(
"--run-dir",
type=Path,
default=DEFAULT_RUN_DIR,
help="Run directory containing {task}/{split}.jsonl files",
)
parser.add_argument("--split", default="val", choices=["val", "test"], help="Dataset split")
parser.add_argument("--verbose", "-v", action="store_true", help="Print full prompt sent to the model")
parser.add_argument("--all", dest="run_all", action="store_true", help="Run all samples in batch mode")
parser.add_argument("--max-samples", type=int, default=None, help="Limit number of samples (batch mode)")
parser.add_argument("--label", default="local-gguf", help="Label for batch output file")
parser.add_argument("--output", type=Path, default=None, help="Output JSON path (batch mode)")
parser.add_argument("--workers", type=int, default=4, help="Concurrent requests; match llama-server --parallel (default 4)")
args = parser.parse_args()
if not check_server():
print("llama-server not running. Start it with:")
print("llama-server -m finetune/models/<model>.gguf -ngl 99 --port 9000 --ctx-size 2048 --log-disable")
sys.exit(1)
samples = load_samples(args.run_dir, args.task, args.split)
total = len(samples)
if args.run_all:
if args.max_samples:
samples = samples[: args.max_samples]
output_path = args.output or Path(f"eval-{args.label}-{args.task}.json")
print(f"Running batch eval: {len(samples)} samples, {args.workers} workers")
run_batch(samples, args.task, args.label, output_path, workers=args.workers)
return
if not args.indices:
print(f"Test set has {total} {args.task} samples (0-{total-1})")
raw = input("Enter index (or press Enter for 0): ").strip()
indices = [int(raw) if raw else 0]
else:
indices = args.indices
for idx in indices:
if not (0 <= idx < total):
print(f"Index {idx} out of range (0-{total-1}), skipping")
continue
run_sample(samples[idx], args.task, total, idx, verbose=args.verbose)
print(f"\n{'β' * 60}\n")
if __name__ == "__main__":
main()
|