| | |
| | """ |
| | test_statistical_e2e.py — Statistically rigorous JIT LoRA training evaluation. |
| | |
| | Dynamically loads real-world facts (post model training cutoff), pre-tests each |
| | against the model to confirm it's truly unknown, trains via LoRA, then evaluates |
| | with proper statistical analysis across multiple independent trials. |
| | |
| | Usage: |
| | # Ensure daemon is running with model activated |
| | python3 test_statistical_e2e.py |
| | |
| | # Custom options |
| | python3 test_statistical_e2e.py --facts-file raw_facts_2026.txt --trials 3 --max-facts 80 |
| | |
| | Data source: facts are loaded from a file generated by web search (not hardcoded). |
| | The file format is: |
| | CATEGORY: <category> |
| | Q: <question> |
| | A: <answer> |
| | KEYWORDS: <comma-separated keywords> |
| | """ |
| |
|
| | import argparse |
| | import json |
| | import math |
| | import os |
| | import random |
| | import re |
| | import statistics |
| | import sys |
| | import time |
| | from dataclasses import dataclass, field |
| | from pathlib import Path |
| | from typing import Optional |
| |
|
| | import requests |
| |
|
| | |
| |
|
# Base URL of the local daemon; every HTTP helper in this file targets it
# (/status, /reset, /chat, /train, /config).
DAEMON_URL = "http://localhost:8766"

# Facts file expected next to this script; used when --facts-file is omitted.
_SCRIPT_DIR = os.path.dirname(__file__)
DEFAULT_FACTS_FILE = os.path.join(_SCRIPT_DIR, "raw_facts_2026.txt")

# Generation budgets for the pre-training and post-training recall queries.
MAX_TOKENS_PRETEST = 80
MAX_TOKENS_POSTTEST = 100

# Default number of training epochs requested per trial.
TRAIN_EPOCHS = 15

# Target share of regularization pairs in the training mix: the trial code
# adds roughly novel * r / (1 - r) regularization pairs.
REGULARIZATION_RATIO = 0.33
| |
|
| |
|
| | |
| |
|
@dataclass
class Fact:
    """A single question/answer fact and the outcome of its pre-test.

    The first four fields come from the facts file; the ``pretest_*`` fields
    start empty and are filled in when the fact is pre-tested against the
    model.
    """

    category: str
    question: str
    answer: str
    keywords: list  # keyword strings looked for in model responses
    pretest_response: str = ""  # raw model reply recorded by the pre-test
    pretest_known: bool = False  # True if the pre-test reply matched the keywords

    def to_training_pair(self):
        """Return the fact as a ``{"user": ..., "assistant": ...}`` dict."""
        return dict(user=self.question, assistant=self.answer)
| |
|
| |
|
@dataclass
class TrialResult:
    """Everything recorded for one train-then-evaluate trial.

    Field order is part of the interface (positional construction), so it
    is kept exactly as declared.
    """

    # --- trial setup -----------------------------------------------------
    trial_id: int
    n_facts_tested: int
    n_confirmed_unknown: int
    n_training_pairs: int
    n_regularization_pairs: int

    # --- training outcome, as reported by the daemon ----------------------
    training_steps: int
    training_time_s: float
    initial_loss: float
    final_loss: float

    # --- evaluation counts -------------------------------------------------
    recall_correct: int
    recall_total: int
    general_correct: int
    general_total: int

    # category name -> {"correct": int, "total": int}
    category_scores: dict = field(default_factory=dict)
| |
|
| |
|
| | |
| |
|
def _append_if_complete(facts: list, record: dict) -> None:
    """Append a Fact built from ``record`` to ``facts`` if it has a question."""
    if not record.get("question"):
        return
    raw_keywords = record.get("keywords", "")
    facts.append(Fact(
        category=record.get("category", "Unknown"),
        question=record["question"],
        answer=record.get("answer", ""),
        # Keywords are stored lower-cased with surrounding whitespace removed;
        # empty entries (e.g. from a trailing comma) are dropped.
        keywords=[k.strip().lower() for k in raw_keywords.split(",") if k.strip()],
    ))


def load_facts_from_file(filepath: str) -> list:
    """Parse the raw facts file into a list of Fact objects.

    Recognised line prefixes are ``CATEGORY:``, ``Q:``, ``A:`` and
    ``KEYWORDS:``. Blank lines and lines starting with ``#`` or ``=`` are
    ignored, as is any other unrecognised line. A ``CATEGORY:`` line applies
    to every following question until the next ``CATEGORY:`` line. A record
    is emitted when the next ``Q:``/``CATEGORY:`` line (or end of file) is
    reached; records without a question are discarded.

    Args:
        filepath: Path to the facts file, read as UTF-8.

    Returns:
        The parsed facts, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    facts = []
    current = {}

    # Explicit UTF-8: the platform default encoding would make parsing of
    # non-ASCII facts depend on the machine's locale.
    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith(("#", "=")):
                continue

            if line.startswith("CATEGORY:"):
                _append_if_complete(facts, current)
                current = {"category": line.split(":", 1)[1].strip()}
            elif line.startswith("Q:"):
                # A new question closes the previous record but inherits
                # its category.
                _append_if_complete(facts, current)
                current = {
                    "category": current.get("category", "Unknown"),
                    "question": line[2:].strip(),
                }
            elif line.startswith("A:"):
                current["answer"] = line[2:].strip()
            elif line.startswith("KEYWORDS:"):
                current["keywords"] = line[len("KEYWORDS:"):].strip()

    # Flush the record still pending at end of file.
    _append_if_complete(facts, current)
    return facts
| |
|
| |
|
| | |
| |
|
# (question, expected keywords) rows for the general-knowledge check.
# Keywords are lower-case substrings looked for in the model's reply. With
# check_keywords' default of min_matches=2, a two-keyword row needs BOTH
# keywords present, and a one-keyword row needs that single keyword.
_GENERAL_KNOWLEDGE_ROWS = (
    ("What is the capital of France?", ["paris"]),
    ("Who wrote Romeo and Juliet?", ["shakespeare"]),
    ("What is the chemical symbol for water?", ["h2o"]),
    ("What planet is closest to the Sun?", ["mercury"]),
    ("What year did World War II end?", ["1945"]),
    ("What is the speed of light in km/s approximately?", ["299", "km"]),
    ("Who painted the Mona Lisa?", ["vinci", "leonardo"]),
    ("What is the largest ocean on Earth?", ["pacific"]),
    ("What gas do plants absorb from the atmosphere?", ["co2", "carbon dioxide"]),
    ("What is the square root of 144?", ["12"]),
    ("Who developed the theory of general relativity?", ["einstein"]),
    ("What is the capital of Japan?", ["tokyo"]),
    ("How many chromosomes do humans have?", ["46", "23 pairs"]),
    ("What element has the atomic number 1?", ["hydrogen"]),
    ("Who was the first person to walk on the Moon?", ["armstrong"]),
    ("What is the boiling point of water in Celsius?", ["100"]),
    ("What is the capital of Australia?", ["canberra"]),
    ("What year was the United Nations founded?", ["1945"]),
    ("What is the chemical formula for table salt?", ["nacl"]),
    ("Who wrote 1984?", ["orwell"]),
)

# Questions used to check that general knowledge survives training.
GENERAL_KNOWLEDGE = [
    {"question": question, "keywords": keywords}
    for question, keywords in _GENERAL_KNOWLEDGE_ROWS
]
| |
|
| | |
| |
|
# (user prompt, assistant reply) rows mixed into the training data as
# regularization. Trials take a prefix of this list, so order matters.
_REGULARIZATION_ROWS = (
    ("What is the capital of France?", "The capital of France is Paris."),
    ("Who wrote Romeo and Juliet?", "William Shakespeare wrote Romeo and Juliet."),
    ("What is the chemical symbol for water?", "The chemical symbol for water is H2O."),
    ("What planet is closest to the Sun?", "Mercury is the closest planet to the Sun."),
    ("What year did World War II end?", "World War II ended in 1945."),
    ("Who painted the Mona Lisa?", "Leonardo da Vinci painted the Mona Lisa."),
    ("What is the largest ocean on Earth?", "The Pacific Ocean is the largest ocean on Earth."),
    ("What gas do plants absorb from the atmosphere?", "Plants absorb carbon dioxide (CO2) from the atmosphere."),
    ("What is the square root of 144?", "The square root of 144 is 12."),
    ("Who developed the theory of general relativity?", "Albert Einstein developed the theory of general relativity."),
    ("What is the capital of Japan?", "The capital of Japan is Tokyo."),
    ("How many chromosomes do humans have?", "Humans have 46 chromosomes, or 23 pairs."),
    ("What element has the atomic number 1?", "Hydrogen has the atomic number 1."),
    ("Who was the first person to walk on the Moon?", "Neil Armstrong was the first person to walk on the Moon in 1969."),
    ("What is the boiling point of water in Celsius?", "The boiling point of water is 100 degrees Celsius."),
    ("What is the capital of Australia?", "The capital of Australia is Canberra."),
    ("What year was the United Nations founded?", "The United Nations was founded in 1945."),
    ("What is the chemical formula for table salt?", "The chemical formula for table salt is NaCl (sodium chloride)."),
    ("Who wrote the novel 1984?", "George Orwell wrote the novel 1984."),
    ("What is the tallest mountain in the world?", "Mount Everest is the tallest mountain in the world at 8,849 meters."),
)

# Well-known Q/A pairs in the {"user", "assistant"} training-pair shape.
REGULARIZATION_PAIRS = [
    {"user": prompt, "assistant": reply}
    for prompt, reply in _REGULARIZATION_ROWS
]
| |
|
| |
|
| | |
| |
|
def daemon_status():
    """Fetch ``/status`` from the daemon and return the decoded JSON body.

    Uses a 10 second timeout. Connection errors, timeouts and non-success
    HTTP statuses propagate as the exceptions ``requests`` raises.
    """
    response = requests.get(f"{DAEMON_URL}/status", timeout=10)
    response.raise_for_status()
    status_doc = response.json()
    return status_doc
| |
|
| |
|
def daemon_reset(retries=3):
    """POST ``/reset`` with ``clear_data=True`` and return the JSON reply.

    Makes up to ``retries`` attempts, waiting 5 seconds after each failed
    one; the exception from the last attempt is re-raised. If ``retries``
    is zero or negative no request is made and ``None`` is returned.
    """
    final_attempt = retries - 1
    attempt = 0
    while attempt < retries:
        try:
            response = requests.post(
                f"{DAEMON_URL}/reset",
                json={"clear_data": True},
                timeout=60,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Out of attempts: let the caller see the real failure.
            if attempt >= final_attempt:
                raise
            print(f" Reset attempt {attempt+1} failed: {e}, retrying in 5s...")
            time.sleep(5)
        attempt += 1
| |
|
| |
|
def _sse_content(payload: str) -> str:
    """Extract the text delta from one streamed JSON chunk.

    Returns ``""`` for anything that is not a usable text chunk: invalid
    JSON, an unexpected shape, an empty ``choices`` list, missing content,
    or content that starts with ``<|``.
    """
    try:
        chunk = json.loads(payload)
    except json.JSONDecodeError:
        return ""
    if not isinstance(chunk, dict):
        return ""
    # ``or [{}]`` also covers an explicitly empty ``choices`` list, which
    # would otherwise raise IndexError and abort the whole stream.
    choices = chunk.get("choices") or [{}]
    first = choices[0] if isinstance(choices, list) else {}
    delta = (first.get("delta") if isinstance(first, dict) else None) or {}
    content = delta.get("content", "") if isinstance(delta, dict) else ""
    if isinstance(content, str) and content and not content.startswith("<|"):
        return content
    return ""


def daemon_query(question: str, max_tokens: int = 100) -> str:
    """Send one user message to ``/chat`` and return the streamed reply text.

    The reply is read as ``data: <json>`` lines until a ``data: [DONE]``
    line. This helper never raises for request or stream failures: a
    failed request prints a ``[Query error ...]`` line and returns ``""``;
    a failure mid-stream prints a ``[Stream error ...]`` line and returns
    whatever text was collected so far.

    Args:
        question: The user message content.
        max_tokens: Generation limit forwarded to the daemon.

    Returns:
        The concatenated text deltas, stripped of surrounding whitespace.
    """
    request_body = {
        "messages": [{"role": "user", "content": question}],
        "max_tokens": max_tokens,
        "stream": True,
    }
    try:
        response = requests.post(
            f"{DAEMON_URL}/chat",
            json=request_body,
            stream=True,
            timeout=180,
        )
    except Exception as e:
        print(f" [Query error: {e}]")
        return ""

    pieces = []
    # A streamed response holds its connection until it is closed; the
    # ``with`` releases it on every exit path (DONE, error, early return).
    with response:
        try:
            response.raise_for_status()
        except Exception as e:
            print(f" [Query error: {e}]")
            return ""

        try:
            # Read raw bytes and decode as UTF-8 ourselves: letting requests
            # decode would use the charset guessed from the headers, which
            # for text/* without a charset is ISO-8859-1.
            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8", errors="replace")
                if not line.startswith("data: "):
                    continue
                payload = line[6:].strip()
                if payload == "[DONE]":
                    break
                pieces.append(_sse_content(payload))
        except Exception as e:
            so_far = "".join(pieces)
            print(f" [Stream error: {e}, got so far: {so_far[:50]}]")

    return "".join(pieces).strip()
| |
|
| |
|
def daemon_inject_and_train(training_pairs: list, epochs: int = TRAIN_EPOCHS) -> dict:
    """Inject training data, start training, and wait for it to finish.

    Each ``{"user": ..., "assistant": ...}`` pair is converted to the
    two-message form sent to ``/train``:
    ``[{"role": "user", ...}, {"role": "assistant", ...}]``.

    The ``/train`` endpoint is async: it starts training in the background
    and returns immediately, so ``/status`` is polled every 2 seconds until
    it no longer reports ``training`` or 600 seconds of waiting have passed.

    Args:
        training_pairs: Dicts with ``"user"`` and ``"assistant"`` keys.
        epochs: Number of epochs requested from the daemon.

    Returns:
        A dict with keys ``steps``, ``final_loss``, ``initial_loss``,
        ``epochs_completed`` and ``early_stopped``. Only the first three are
        filled from ``/status``; the last two keep their defaults. On
        timeout the values observed so far are returned and a warning is
        printed.

    Raises:
        requests.RequestException: If the initial ``/train`` request fails.
    """
    messages = [
        [
            {"role": "user", "content": pair["user"]},
            {"role": "assistant", "content": pair["assistant"]},
        ]
        for pair in training_pairs
    ]

    r = requests.post(
        f"{DAEMON_URL}/train",
        json={"messages": messages, "epochs": epochs},
        timeout=30,
    )
    r.raise_for_status()
    start_response = r.json()
    print(f" Train started: injected={start_response.get('injected', 0)}, epochs={start_response.get('epochs', 0)}")

    poll_interval = 2
    max_wait = 600
    elapsed = 0  # counts sleep time only, not time spent in /status calls
    result = {"steps": 0, "final_loss": 0, "initial_loss": 0, "epochs_completed": 0, "early_stopped": False}

    while elapsed < max_wait:
        time.sleep(poll_interval)
        elapsed += poll_interval
        # Best-effort polling: any failure (network, unexpected payload) is
        # reported and the next poll is attempted.
        try:
            status = daemon_status()
            current_steps = status.get("total_steps", 0)
            current_loss = status.get("last_loss", 0)

            if not status.get("training", False):
                result["steps"] = current_steps
                result["final_loss"] = current_loss
                # If no loss was seen while training was running (e.g. it
                # finished before the first poll), fall back to the final
                # loss instead of reporting 0.
                if not result["initial_loss"]:
                    result["initial_loss"] = current_loss
                break

            # The first positive loss seen while training is running is the
            # best available approximation of the initial loss.
            if result["initial_loss"] == 0 and current_loss > 0:
                result["initial_loss"] = current_loss

            if elapsed % 30 == 0:
                print(f" ... training: step={current_steps}, loss={current_loss:.4f}")
        except Exception as e:
            print(f" [Poll error: {e}]")
    else:
        # Loop ended without ``break``: the daemon never reported completion.
        print(f" [Warning: training did not finish within {max_wait}s; reported results are incomplete]")

    return result
| |
|
| |
|
def daemon_set_auto_train(enabled: bool):
    """Best-effort toggle of the daemon's ``auto_train`` config flag.

    Sends ``PUT /config`` with ``{"auto_train": enabled}``. Failures are
    reported as a printed warning and never raised.
    """
    try:
        response = requests.put(
            f"{DAEMON_URL}/config",
            json={"auto_train": enabled},
            timeout=10,
        )
        response.raise_for_status()
    except Exception as e:
        print(f" [Warning: could not set auto_train={enabled}: {e}]")
| |
|
| |
|
| | |
| |
|
# Translation table mapping Unicode subscript and superscript digits to ASCII.
_SUB_SUPER_TO_ASCII = str.maketrans(
    "₀₁₂₃₄₅₆₇₈₉⁰¹²³⁴⁵⁶⁷⁸⁹",
    "0123456789" * 2,
)


def normalize_unicode(text: str) -> str:
    """Replace Unicode subscript/superscript digits with ASCII digits.

    For example ``"H₂O"`` becomes ``"H2O"``. All other characters are left
    unchanged.
    """
    return text.translate(_SUB_SUPER_TO_ASCII)


def check_keywords(response: str, keywords: list, min_matches: int = 2) -> bool:
    """Check whether ``response`` contains enough of the expected keywords.

    Matching is a case-insensitive substring test after subscript and
    superscript digits are folded to ASCII. The response and the keywords
    go through the same normalization, so a keyword such as ``"h₂o"`` or
    ``"Paris"`` can match.

    At least ``min_matches`` keywords must match, to avoid false positives
    from models that produce topic-relevant but factually wrong responses.
    When fewer than ``min_matches`` keywords are supplied, all of them must
    match.

    Args:
        response: Model output to inspect.
        keywords: Expected keyword strings; blank entries are ignored.
        min_matches: Number of keywords that must be present.

    Returns:
        True if enough keywords match; False otherwise, including when no
        usable keyword is supplied.
    """
    if not keywords:
        return False

    haystack = normalize_unicode(response.lower())
    # Normalize keywords exactly like the response, and drop blank ones: an
    # empty string is a substring of everything and would count as a match.
    needles = [normalize_unicode(kw.strip().lower()) for kw in keywords]
    needles = [needle for needle in needles if needle]
    if not needles:
        return False

    matches = sum(1 for needle in needles if needle in haystack)
    required = min(min_matches, len(needles))
    return matches >= required
| |
|
| |
|
def pretest_facts(facts: list) -> tuple:
    """Query the model with every fact and split facts by the outcome.

    Each fact's ``pretest_response`` and ``pretest_known`` attributes are
    updated in place. A progress line is printed for every tenth fact and
    for every fact the model already answers correctly.

    Returns:
        ``(unknown, known)``: two lists of facts, each in input order.
    """
    total = len(facts)
    unknown, known = [], []

    print(f"\n Pre-testing {total} facts against model...")
    for position, fact in enumerate(facts, start=1):
        reply = daemon_query(fact.question, max_tokens=MAX_TOKENS_PRETEST)
        already_known = check_keywords(reply, fact.keywords)
        fact.pretest_response = reply
        fact.pretest_known = already_known

        if already_known or position % 10 == 0:
            label = "KNOWN" if already_known else "unknown"
            print(f" [{position}/{total}] {label}: {fact.question[:60]}...")

        (known if already_known else unknown).append(fact)

    print(f" Pre-test complete: {len(unknown)} unknown, {len(known)} already known")
    return unknown, known
| |
|
| |
|
def evaluate_recall(facts: list) -> list:
    """Ask the model each fact's question after training and grade the reply.

    Prints a progress line after every tenth fact.

    Returns:
        One ``(fact, correct, response)`` tuple per fact, in input order.
    """
    total = len(facts)
    graded = []
    for position, fact in enumerate(facts, start=1):
        reply = daemon_query(fact.question, max_tokens=MAX_TOKENS_POSTTEST)
        graded.append((fact, check_keywords(reply, fact.keywords), reply))
        if position % 10 == 0:
            print(f" [{position}/{total}] recall testing...")
    return graded
| |
|
| |
|
def evaluate_general_knowledge() -> list:
    """Grade the model on every GENERAL_KNOWLEDGE question.

    Returns:
        One ``(item, correct, response)`` tuple per question, in list order.
    """
    def grade(item):
        reply = daemon_query(item["question"], max_tokens=100)
        return (item, check_keywords(reply, item["keywords"]), reply)

    return [grade(item) for item in GENERAL_KNOWLEDGE]
| |
|
| |
|
| | |
| |
|
def clopper_pearson(k: int, n: int, alpha: float = 0.05) -> tuple:
    """Return the Clopper-Pearson exact binomial interval for k successes in n.

    The bounds are beta-distribution quantiles, with the lower bound pinned
    to 0.0 when ``k == 0`` and the upper bound pinned to 1.0 when
    ``k == n``. Returns ``(0.0, 0.0)`` when ``n == 0``. scipy is imported
    lazily so the rest of the script runs without it.
    """
    if n == 0:
        return (0.0, 0.0)

    from scipy import stats as scipy_stats
    beta = scipy_stats.beta

    if k > 0:
        lower = beta.ppf(alpha / 2, k, n - k + 1)
    else:
        lower = 0.0

    if k < n:
        upper = beta.ppf(1 - alpha / 2, k + 1, n - k)
    else:
        upper = 1.0

    return (lower, upper)
| |
|
| |
|
def wilson_interval(k: int, n: int, z: float = 1.96) -> tuple:
    """Return the Wilson score interval for k successes in n (no scipy needed).

    ``z`` is the normal quantile (1.96 for ~95%). The bounds are clamped to
    [0, 1]. Returns ``(0.0, 0.0)`` when ``n == 0``.
    """
    if n == 0:
        return (0.0, 0.0)

    p_hat = k / n
    denom = 1 + z**2 / n
    center = (p_hat + z**2 / (2 * n)) / denom
    spread = math.sqrt((p_hat * (1 - p_hat) + z**2 / (4 * n)) / n)
    margin = z * spread / denom

    lower = max(0.0, center - margin)
    upper = min(1.0, center + margin)
    return (lower, upper)
| |
|
| |
|
| | |
| |
|
def _begin_trial(trial_id: int) -> None:
    """Print the trial banner and reset the daemon for a clean trial."""
    print(f"\n{'='*70}")
    print(f" TRIAL {trial_id}")
    print(f"{'='*70}")

    print(" Resetting adapter and data buffers...")
    daemon_reset()
    time.sleep(2)


def _build_training_mix(unknown_facts: list) -> tuple:
    """Return ``(all_pairs, n_novel, n_reg_used)`` for one trial.

    The novel pairs are joined with a prefix of REGULARIZATION_PAIRS sized
    so regularization makes up about REGULARIZATION_RATIO of the mix (at
    least one pair, at most the whole list), then shuffled in place.
    """
    novel_pairs = [f.to_training_pair() for f in unknown_facts]
    n_reg_needed = max(1, int(len(novel_pairs) * REGULARIZATION_RATIO / (1 - REGULARIZATION_RATIO)))
    n_reg_used = min(n_reg_needed, len(REGULARIZATION_PAIRS))

    all_pairs = novel_pairs + REGULARIZATION_PAIRS[:n_reg_used]
    random.shuffle(all_pairs)
    return all_pairs, len(novel_pairs), n_reg_used


def _score_by_category(recall_results: list) -> dict:
    """Tally recall results into ``{category: {"correct": n, "total": n}}``."""
    scores = {}
    for fact, correct, _ in recall_results:
        bucket = scores.setdefault(fact.category, {"correct": 0, "total": 0})
        bucket["total"] += 1
        if correct:
            bucket["correct"] += 1
    return scores


def _print_trial_report(result, train_time: float, recall_results: list, gen_results: list) -> None:
    """Print the per-trial summary followed by the failed questions."""
    print(f"\n Trial {result.trial_id} Results:")
    print(f" Recall: {result.recall_correct}/{result.recall_total} ({result.recall_correct/max(1,result.recall_total)*100:.1f}%)")
    print(f" General Knowledge: {result.general_correct}/{result.general_total} ({result.general_correct/result.general_total*100:.1f}%)")
    print(f" Training: {result.training_steps} steps, {train_time:.1f}s, loss {result.initial_loss:.3f} → {result.final_loss:.3f}")

    failures = [(f, r) for f, c, r in recall_results if not c]
    if failures:
        print(f"\n Failed recalls ({len(failures)}):")
        # Only the first ten failures are shown to keep the log readable.
        for fact, resp in failures[:10]:
            print(f" Q: {fact.question[:70]}")
            print(f" Expected keywords: {fact.keywords}")
            print(f" Got: {resp[:100]}")
            print()

    gen_failures = [(item, r) for item, c, r in gen_results if not c]
    if gen_failures:
        print(f" General knowledge failures ({len(gen_failures)}):")
        for item, resp in gen_failures:
            print(f" Q: {item['question']}")
            print(f" Expected: {item['keywords']}")
            print(f" Got: {resp[:100]}")


def _train_and_evaluate(unknown_facts: list, trial_id: int, epochs: int, n_facts_tested: int):
    """Train on ``unknown_facts`` and evaluate; shared by both trial runners."""
    all_pairs, n_novel, n_reg_used = _build_training_mix(unknown_facts)
    print(f" Training data: {n_novel} novel + {n_reg_used} regularization = {len(all_pairs)} total")
    print(f" Regularization ratio: {n_reg_used / len(all_pairs) * 100:.1f}%")

    print(f" Training ({epochs} epochs max, early stopping enabled)...")
    t0 = time.time()
    train_result = daemon_inject_and_train(all_pairs, epochs=epochs)
    train_time = time.time() - t0
    print(f" Training complete: {train_time:.1f}s")
    print(f" {json.dumps({k: train_result.get(k) for k in ['steps', 'final_loss', 'initial_loss', 'epochs_completed', 'early_stopped']}, default=str)}")

    time.sleep(2)

    # NOTE(review): auto_train is switched off before evaluating, presumably
    # so evaluation queries are not fed back into training; confirm against
    # the daemon's behavior.
    daemon_set_auto_train(False)
    print(f"\n Evaluating recall ({len(unknown_facts)} facts)...")
    recall_results = evaluate_recall(unknown_facts)
    recall_correct = sum(1 for _, c, _ in recall_results if c)

    print(f" Evaluating general knowledge ({len(GENERAL_KNOWLEDGE)} questions)...")
    gen_results = evaluate_general_knowledge()
    gen_correct = sum(1 for _, c, _ in gen_results if c)

    result = TrialResult(
        trial_id=trial_id,
        n_facts_tested=n_facts_tested,
        n_confirmed_unknown=len(unknown_facts),
        n_training_pairs=len(all_pairs),
        n_regularization_pairs=n_reg_used,
        training_steps=train_result.get("steps", 0),
        training_time_s=train_time,
        initial_loss=train_result.get("initial_loss", 0),
        final_loss=train_result.get("final_loss", 0),
        recall_correct=recall_correct,
        recall_total=len(unknown_facts),
        general_correct=gen_correct,
        general_total=len(GENERAL_KNOWLEDGE),
        category_scores=_score_by_category(recall_results),
    )

    _print_trial_report(result, train_time, recall_results, gen_results)
    return result


def run_trial(facts: list, trial_id: int, epochs: int = TRAIN_EPOCHS) -> TrialResult:
    """Run a single trial: reset → pre-test → train → evaluate.

    Args:
        facts: Candidate facts; only those the pre-test marks as unknown
            are trained on and evaluated.
        trial_id: Label used in the printed output and in the result.
        epochs: Number of epochs requested for training.

    Returns:
        The TrialResult for this trial. ``n_facts_tested`` is the number of
        candidate facts that were pre-tested.
    """
    _begin_trial(trial_id)

    unknown_facts, _known_facts = pretest_facts(facts)
    if len(unknown_facts) < 10:
        # Only a warning: the trial still runs with whatever is available.
        print(f" WARNING: Only {len(unknown_facts)} unknown facts — insufficient for evaluation")

    return _train_and_evaluate(unknown_facts, trial_id, epochs, n_facts_tested=len(facts))


def run_trial_prefiltered(unknown_facts: list, trial_id: int, epochs: int = TRAIN_EPOCHS) -> TrialResult:
    """Run a trial on facts already confirmed unknown; skips pre-testing.

    Args:
        unknown_facts: Facts to train on and evaluate.
        trial_id: Label used in the printed output and in the result.
        epochs: Number of epochs requested for training.

    Returns:
        The TrialResult for this trial. ``n_facts_tested`` equals the
        number of facts passed in.
    """
    _begin_trial(trial_id)
    return _train_and_evaluate(unknown_facts, trial_id, epochs, n_facts_tested=len(unknown_facts))
| |
|
| |
|
| | |
| |
|
def _stdev_or_zero(values: list):
    """Sample standard deviation, or 0 when fewer than two values exist."""
    return statistics.stdev(values) if len(values) > 1 else 0


def _print_rate_section(title_line: str, k: int, n: int, rates: list, ci: tuple) -> None:
    """Print one pooled-rate section of the aggregate results box."""
    print(title_line)
    print(f" │ Pooled: {k}/{n} ({k/max(1,n)*100:.1f}%) │")
    print(f" │ Per-trial rates: {[f'{r:.1%}' for r in rates]}")
    if len(rates) > 1:
        print(f" │ Mean ± StdDev: {statistics.mean(rates):.1%} ± {statistics.stdev(rates):.1%}")
    print(f" │ 95% CI (Wilson): [{ci[0]:.1%}, {ci[1]:.1%}]")
    print(f" │ │")


def _rate_summary(k: int, n: int, rates: list, ci: tuple) -> dict:
    """Build the JSON summary for one pooled rate (recall or general)."""
    return {
        "pooled_correct": k,
        "pooled_total": n,
        "pooled_rate": k / max(1, n),
        "per_trial_rates": rates,
        "mean": statistics.mean(rates),
        "stdev": _stdev_or_zero(rates),
        "ci_95_lower": ci[0],
        "ci_95_upper": ci[1],
    }


def run_evaluation(facts: list, n_trials: int = 3, epochs: int = TRAIN_EPOCHS):
    """Run multiple trials and report aggregate statistics.

    All facts are pre-tested once against a freshly reset daemon. The facts
    confirmed unknown are then trained on and evaluated in ``n_trials``
    trials, each starting from a reset and using its own shuffle. The
    aggregate report is printed and written to ``evaluation_results.json``
    next to this script.

    Args:
        facts: Candidate facts to evaluate.
        n_trials: Number of trials to run; must be at least 1.
        epochs: Number of epochs requested per trial.

    Returns:
        The results dict that was written to disk, or ``None`` when
        ``n_trials`` is below 1 or fewer than 10 facts are unknown.
    """
    if n_trials < 1:
        # With no trials the aggregate statistics below are undefined.
        print(" ERROR: n_trials must be at least 1.")
        return None

    # Read the model name once and reuse it for the report, so a daemon
    # hiccup after the last trial cannot discard the collected results.
    model_key = daemon_status().get("model_key", "unknown")

    print(f"\n{'#'*70}")
    print(f" STATISTICAL JIT LoRA EVALUATION")
    print(f" Model: {model_key}")
    print(f" Facts available: {len(facts)}")
    print(f" Trials: {n_trials}")
    print(f" Epochs: {epochs} (with early stopping)")
    print(f" Regularization target: {REGULARIZATION_RATIO*100:.0f}%")
    print(f"{'#'*70}")

    daemon_set_auto_train(False)

    # One-time baseline against a freshly reset daemon.
    print(f"\n === Pre-testing all {len(facts)} facts (one-time baseline) ===")
    daemon_reset()
    time.sleep(2)
    unknown_facts, known_facts = pretest_facts(facts)
    print(f"\n Baseline: {len(unknown_facts)} confirmed unknown, {len(known_facts)} already known")
    print(f" Will train on {len(unknown_facts)} unknown facts across {n_trials} trials\n")

    if len(unknown_facts) < 10:
        print(" ERROR: Too few unknown facts for meaningful evaluation.")
        print(" The model already knows most of the dataset.")
        return None

    results = []
    for trial in range(1, n_trials + 1):
        # Each trial gets its own shuffled copy of the unknown facts.
        trial_unknown = unknown_facts.copy()
        random.shuffle(trial_unknown)
        results.append(run_trial_prefiltered(trial_unknown, trial, epochs))

    print(f"\n{'='*70}")
    print(f" AGGREGATE RESULTS ({n_trials} trials)")
    print(f"{'='*70}")

    recall_rates = [r.recall_correct / max(1, r.recall_total) for r in results]
    general_rates = [r.general_correct / max(1, r.general_total) for r in results]
    training_times = [r.training_time_s for r in results]
    training_steps_list = [r.training_steps for r in results]
    n_unknown_list = [r.n_confirmed_unknown for r in results]

    # NOTE(review): pooling counts the same facts once per trial, so the
    # pooled observations are not independent and the Wilson intervals are
    # likely narrower than the data warrants.
    pooled_recall_k = sum(r.recall_correct for r in results)
    pooled_recall_n = sum(r.recall_total for r in results)
    pooled_gen_k = sum(r.general_correct for r in results)
    pooled_gen_n = sum(r.general_total for r in results)

    recall_ci = wilson_interval(pooled_recall_k, pooled_recall_n)
    general_ci = wilson_interval(pooled_gen_k, pooled_gen_n)

    print(f"\n Confirmed unknown facts per trial: {n_unknown_list}")
    print(f" (facts the model verified it did NOT know before training)")

    print(f"\n ┌─────────────────────────────────────────────────────────────────┐")
    _print_rate_section(
        f" │ RECALL (post-training) │",
        pooled_recall_k, pooled_recall_n, recall_rates, recall_ci,
    )
    _print_rate_section(
        f" │ GENERAL KNOWLEDGE (preservation) │",
        pooled_gen_k, pooled_gen_n, general_rates, general_ci,
    )
    print(f" │ TRAINING │")
    print(f" │ Mean time: {statistics.mean(training_times):.1f}s ± {_stdev_or_zero(training_times):.1f}s")
    print(f" │ Mean steps: {statistics.mean(training_steps_list):.0f}")
    print(f" └─────────────────────────────────────────────────────────────────┘")

    all_categories = set()
    for r in results:
        all_categories.update(r.category_scores.keys())

    print(f"\n Per-Category Recall (pooled across trials):")
    print(f" {'Category':<25} {'Correct':>8} {'Total':>8} {'Rate':>8} {'95% CI':>16}")
    print(f" {'-'*25} {'-'*8} {'-'*8} {'-'*8} {'-'*16}")

    for cat in sorted(all_categories):
        cat_k = sum(r.category_scores.get(cat, {}).get("correct", 0) for r in results)
        cat_n = sum(r.category_scores.get(cat, {}).get("total", 0) for r in results)
        if cat_n > 0:
            cat_ci = wilson_interval(cat_k, cat_n)
            print(f" {cat:<25} {cat_k:>8} {cat_n:>8} {cat_k/cat_n:>8.1%} [{cat_ci[0]:.1%}, {cat_ci[1]:.1%}]")

    output = {
        "model": model_key,
        "n_trials": n_trials,
        "epochs": epochs,
        "regularization_ratio": REGULARIZATION_RATIO,
        "aggregate": {
            "recall": _rate_summary(pooled_recall_k, pooled_recall_n, recall_rates, recall_ci),
            "general_knowledge": _rate_summary(pooled_gen_k, pooled_gen_n, general_rates, general_ci),
            "training": {
                "mean_time_s": statistics.mean(training_times),
                "stdev_time_s": _stdev_or_zero(training_times),
                "mean_steps": statistics.mean(training_steps_list),
                "per_trial_times": training_times,
            },
        },
        "trials": [
            {
                "trial_id": r.trial_id,
                "n_confirmed_unknown": r.n_confirmed_unknown,
                "n_training_pairs": r.n_training_pairs,
                "training_steps": r.training_steps,
                "training_time_s": r.training_time_s,
                "initial_loss": r.initial_loss,
                "final_loss": r.final_loss,
                "recall_correct": r.recall_correct,
                "recall_total": r.recall_total,
                "recall_rate": r.recall_correct / max(1, r.recall_total),
                "general_correct": r.general_correct,
                "general_total": r.general_total,
                "general_rate": r.general_correct / max(1, r.general_total),
                "category_scores": r.category_scores,
            }
            for r in results
        ],
    }

    results_path = os.path.join(os.path.dirname(__file__), "evaluation_results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"\n Results saved to: {results_path}")

    return output
| |
|
| |
|
| | |
| |
|
def main():
    """Command-line entry point: load facts, run the evaluation, print a verdict.

    Exits with status 1 when the daemon is unreachable or inactive, when
    the facts file is missing, or when the evaluation could not produce
    results. A below-threshold result is reported but does not change the
    exit status.
    """
    from collections import Counter

    parser = argparse.ArgumentParser(description="Statistical JIT LoRA evaluation")
    parser.add_argument("--facts-file", default=DEFAULT_FACTS_FILE,
                        help="Path to raw facts file (default: raw_facts_2026.txt)")
    parser.add_argument("--trials", type=int, default=3,
                        help="Number of independent trials (default: 3)")
    parser.add_argument("--max-facts", type=int, default=0,
                        help="Max facts to use (0 = all, default: 0)")
    parser.add_argument("--epochs", type=int, default=TRAIN_EPOCHS,
                        help=f"Training epochs per trial (default: {TRAIN_EPOCHS})")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility (default: 42)")
    args = parser.parse_args()

    if args.trials < 1:
        parser.error("--trials must be at least 1")

    random.seed(args.seed)

    # Only the network call sits in the try, so a malformed status payload
    # is not misreported as an unreachable daemon.
    try:
        status = daemon_status()
    except Exception as e:
        print(f"ERROR: Cannot reach daemon at {DAEMON_URL}: {e}")
        sys.exit(1)
    if not status.get("active"):
        print("ERROR: Daemon not active. Call /activate first.")
        sys.exit(1)
    print(f"Daemon OK: {status.get('model_key', 'unknown')}, {status.get('trainable_params', '?')} trainable params")

    if not os.path.exists(args.facts_file):
        print(f"ERROR: Facts file not found: {args.facts_file}")
        print("Generate it first by running the web scraper or provide a path.")
        sys.exit(1)

    facts = load_facts_from_file(args.facts_file)
    print(f"Loaded {len(facts)} facts from {args.facts_file}")

    # Drop repeated questions (case-insensitive), keeping the first occurrence.
    seen = set()
    unique_facts = []
    for fact in facts:
        key = fact.question.lower().strip()
        if key not in seen:
            seen.add(key)
            unique_facts.append(fact)
    facts = unique_facts
    print(f"After dedup: {len(facts)} unique facts")

    cats = Counter(fact.category for fact in facts)
    print(f"Categories: {dict(sorted(cats.items()))}")

    if 0 < args.max_facts < len(facts):
        # Seeded above, so the sample is reproducible for a given --seed.
        facts = random.sample(facts, args.max_facts)
        print(f"Sampled down to {len(facts)} facts")

    output = run_evaluation(facts, n_trials=args.trials, epochs=args.epochs)
    if output is None:
        # run_evaluation has already printed why no results were produced.
        sys.exit(1)

    recall_rate = output["aggregate"]["recall"]["mean"]
    gen_rate = output["aggregate"]["general_knowledge"]["mean"]

    print(f"\n{'='*70}")
    if recall_rate >= 0.50 and gen_rate >= 0.80:
        print(f" ✓ EVALUATION PASSED")
        print(f" Recall: {recall_rate:.1%} (≥50% threshold)")
        print(f" General Knowledge: {gen_rate:.1%} (≥80% threshold)")
    else:
        print(f" ✗ EVALUATION BELOW THRESHOLD")
        print(f" Recall: {recall_rate:.1%} {'✓' if recall_rate >= 0.50 else '✗ (<50%)'}")
        print(f" General Knowledge: {gen_rate:.1%} {'✓' if gen_rate >= 0.80 else '✗ (<80%)'}")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()
| |
|