samerzaher80 committed
Commit 1a6e63a · verified · 1 Parent(s): af05e59

Upload 4 files

analyze_anli_errors_round1.py ADDED
@@ -0,0 +1,203 @@
+ import os
+ import csv
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from torch.nn.functional import softmax
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ # ============================
+ # CONFIG
+ # ============================
+
+ # 🔴 Target model you want to improve
+ MODEL_PATH = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\models\student_biomed_kd_fast\adni_srl_round13_smart"
+
+ # Where to save ANLI error buffers
+ OUTPUT_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis"
+
+ BATCH_SIZE = 32
+ MAX_LENGTH = 192
+
+ LABEL_ID2NAME = {
+     0: "entailment",
+     1: "neutral",
+     2: "contradiction",
+ }
+
+
+ # ============================
+ # HELPERS
+ # ============================
+
+ def ensure_output_dir(path: str):
+     os.makedirs(path, exist_ok=True)
+
+
+ def to_label_name(label_id: int) -> str:
+     return LABEL_ID2NAME.get(int(label_id), f"label_{label_id}")
+
+
+ def compute_error_type(true_id: int, pred_id: int) -> str:
+     if true_id == pred_id:
+         return "correct"
+     # Example: E->N, N->C, C->E
+     return f"{to_label_name(true_id)[0].upper()}->{to_label_name(pred_id)[0].upper()}"
+
+
+ def run_model_on_dataset(model, tokenizer, data, split_name: str, round_tag: str,
+                          device: torch.device, output_dir: str):
+     """
+     Run the model on a dataset (list of dicts with keys: 'premise', 'hypothesis', 'label', 'id').
+     Save a CSV with detailed per-example info.
+     """
+     rows = []
+
+     print(f"\n=== Processing ANLI {split_name} ({len(data)} examples) ===")
+
+     model.eval()
+     with torch.no_grad():
+         for idx in tqdm(range(0, len(data), BATCH_SIZE), desc=f"{split_name} batches"):
+             batch_examples = data[idx:idx + BATCH_SIZE]
+
+             premises = [ex["premise"] for ex in batch_examples]
+             hypotheses = [ex["hypothesis"] for ex in batch_examples]
+             labels = [int(ex["label"]) for ex in batch_examples]
+
+             enc = tokenizer(
+                 premises,
+                 hypotheses,
+                 padding=True,
+                 truncation=True,
+                 max_length=MAX_LENGTH,
+                 return_tensors="pt",
+             )
+
+             input_ids = enc["input_ids"].to(device)
+             attention_mask = enc["attention_mask"].to(device)
+
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+             logits = outputs.logits          # [B, 3]
+             probs = softmax(logits, dim=-1)  # [B, 3]
+
+             pred_ids = torch.argmax(probs, dim=-1).cpu().tolist()
+             probs_np = probs.cpu().tolist()
+
+             for i, ex in enumerate(batch_examples):
+                 true_id = int(labels[i])
+                 pred_id = int(pred_ids[i])
+                 prob_vec = probs_np[i]
+                 prob_true = float(prob_vec[true_id])
+
+                 is_error = int(true_id != pred_id)
+                 err_type = compute_error_type(true_id, pred_id)
+
+                 # Fall back to 'uid' (or the running index) when 'id' is missing or None.
+                 ex_id = ex.get("id") or ex.get("uid") or (idx + i)
+
+                 rows.append({
+                     "id": ex_id,
+                     "premise": ex["premise"],
+                     "hypothesis": ex["hypothesis"],
+                     "true_label_id": true_id,
+                     "true_label": to_label_name(true_id),
+                     "pred_label_id": pred_id,
+                     "pred_label": to_label_name(pred_id),
+                     "is_error": is_error,
+                     "error_type": err_type,
+                     # NOTE: despite the "logit_" names, these are softmax probabilities.
+                     "logit_entailment": float(prob_vec[0]),
+                     "logit_neutral": float(prob_vec[1]),
+                     "logit_contradiction": float(prob_vec[2]),
+                     "conf_true_label": prob_true,
+                     "difficulty": 1.0 - prob_true,
+                 })
+
+     ensure_output_dir(output_dir)
+     out_path = os.path.join(output_dir, f"anli_error_buffer_{split_name}_{round_tag}.csv")
+
+     fieldnames = [
+         "id",
+         "premise",
+         "hypothesis",
+         "true_label_id",
+         "true_label",
+         "pred_label_id",
+         "pred_label",
+         "is_error",
+         "error_type",
+         "logit_entailment",
+         "logit_neutral",
+         "logit_contradiction",
+         "conf_true_label",
+         "difficulty",
+     ]
+
+     with open(out_path, "w", encoding="utf-8", newline="") as f:
+         writer = csv.DictWriter(f, fieldnames=fieldnames)
+         writer.writeheader()
+         for row in rows:
+             writer.writerow(row)
+
+     total = len(rows)
+     errors = sum(r["is_error"] for r in rows)
+     acc = 100.0 * (total - errors) / max(1, total)
+
+     print(f"Saved {total} rows to: {out_path}")
+     print(f"{split_name} accuracy (recomputed here): {acc:.2f}% (errors={errors})")
+
+
+ # ============================
+ # MAIN
+ # ============================
+
+ def main():
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     print(f"\nLoading tokenizer and model from: {MODEL_PATH}")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+     model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+     model.to(device)
+
+     # ANLI splits: dev_r1, dev_r2, dev_r3
+     anli_splits = {
+         "anli_r1_dev": "dev_r1",
+         "anli_r2_dev": "dev_r2",
+         "anli_r3_dev": "dev_r3",
+     }
+
+     for split_name, hf_split in anli_splits.items():
+         print(f"\nLoading ANLI split: {hf_split}")
+         ds = load_dataset("anli", split=hf_split)
+
+         # Filter out unlabeled (-1) if present and map into a simple list of dicts
+         data = []
+         for ex in ds:
+             label = int(ex["label"])
+             if label < 0:
+                 continue
+             data.append({
+                 "id": ex.get("uid", None),
+                 "premise": ex["premise"],
+                 "hypothesis": ex["hypothesis"],
+                 "label": label,
+             })
+
+         print(f"{split_name}: {len(data)} labeled examples")
+
+         run_model_on_dataset(
+             model=model,
+             tokenizer=tokenizer,
+             data=data,
+             split_name=split_name,  # will appear in the output filename
+             round_tag="round14",    # matches the round14 buffers consumed downstream
+             device=device,
+             output_dir=OUTPUT_DIR,
+         )
+
+     print("\nAll done. ANLI error buffers are ready for SRL fine-tuning.")
+
+
+ if __name__ == "__main__":
+     main()
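
A quick sanity check on the buffers this script writes (a minimal sketch; the directory and filename follow the OUTPUT_DIR and round14 tag configured above, and pandas is assumed to be available):

import os
import pandas as pd

ANALYSIS_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis"
buf = pd.read_csv(os.path.join(ANALYSIS_DIR, "anli_error_buffer_anli_r1_dev_round14.csv"))

# Errors only, hardest first (difficulty = 1 - probability of the true label).
errors = buf[buf["is_error"] == 1].sort_values("difficulty", ascending=False)
print(errors["error_type"].value_counts())  # e.g. counts of N->C, E->N, ...
print(errors[["premise", "hypothesis", "true_label", "pred_label"]].head())
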
build_anli_global_error_buffer_round1.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ # ============================
+ # INPUT FILES (already created)
+ # ============================
+
+ BASE_ANALYSIS_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis"
+
+ ANLI_R1_CSV = os.path.join(BASE_ANALYSIS_DIR, "anli_error_buffer_anli_r1_dev_round14.csv")
+ ANLI_R2_CSV = os.path.join(BASE_ANALYSIS_DIR, "anli_error_buffer_anli_r2_dev_round14.csv")
+ ANLI_R3_CSV = os.path.join(BASE_ANALYSIS_DIR, "anli_error_buffer_anli_r3_dev_round14.csv")
+
+ # ============================
+ # OUTPUT FILES (global ANLI buffer)
+ # ============================
+
+ OUT_TRAIN = os.path.join(BASE_ANALYSIS_DIR, "global_error_buffer_anli_round14_train.csv")
+ OUT_VAL = os.path.join(BASE_ANALYSIS_DIR, "global_error_buffer_anli_round14_val.csv")
+
+ RANDOM_SEED = 42
+ VAL_RATIO = 0.20  # 80% train / 20% val
+
+
+ def main():
+     print("============================================================")
+     print("BUILD GLOBAL ANLI ERROR BUFFER (ROUND 1 → SRL SOURCE)")
+     print("============================================================")
+
+     # 1) Load the three ANLI error CSVs
+     print("\nLoading ANLI error buffers...")
+     df_r1 = pd.read_csv(ANLI_R1_CSV)
+     df_r2 = pd.read_csv(ANLI_R2_CSV)
+     df_r3 = pd.read_csv(ANLI_R3_CSV)
+
+     print(f" R1 rows: {len(df_r1)}")
+     print(f" R2 rows: {len(df_r2)}")
+     print(f" R3 rows: {len(df_r3)}")
+
+     # 2) Concatenate
+     df_all = pd.concat([df_r1, df_r2, df_r3], ignore_index=True)
+     print(f"\nTotal ANLI rows (R1+R2+R3): {len(df_all)}")
+
+     # Sanity: required columns for SRL pipeline
+     required_cols = ["premise", "hypothesis", "true_label_id", "is_error"]
+     missing = [c for c in required_cols if c not in df_all.columns]
+     if missing:
+         raise ValueError(f"Missing required columns in ANLI buffers: {missing}")
+
+     # 3) Shuffle + split into train/val
+     df_all = df_all.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)
+
+     train_df, val_df = train_test_split(
+         df_all,
+         test_size=VAL_RATIO,
+         random_state=RANDOM_SEED,
+         shuffle=True,
+         stratify=df_all["true_label_id"],  # keep class balance
+     )
+
+     print(f"\nTrain size: {len(train_df)}")
+     print(f"Val size: {len(val_df)}")
+
+     # 4) Show distributions
+     def show_dist(name, df):
+         print(f"\n{name} - class distribution:")
+         total = len(df)
+         for label_id, label_name in {0: "entailment", 1: "neutral", 2: "contradiction"}.items():
+             count = (df["true_label_id"] == label_id).sum()
+             print(f" {label_name}: {count} ({100.0 * count / total:.1f}%)")
+
+         errors = df["is_error"].sum()
+         print(f"{name} - errors: {errors} ({100.0 * errors / total:.1f}%), correct: {total - errors}")
+
+     show_dist("TRAIN", train_df)
+     show_dist("VAL", val_df)
+
+     # 5) Save
+     train_df.to_csv(OUT_TRAIN, index=False, encoding="utf-8")
+     val_df.to_csv(OUT_VAL, index=False, encoding="utf-8")
+
+     print("\nSaved:")
+     print(f" Train: {OUT_TRAIN}")
+     print(f" Val : {OUT_VAL}")
+     print("\n✅ Global ANLI error buffers are ready for SRL.")
+     print("Use them as input to the SRL buffer rebalance script.")
+
+
+ if __name__ == "__main__":
+     main()
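
To confirm the stratified split behaved as intended, the saved CSVs can be checked directly (a minimal sketch reusing the output paths defined above):

import os
import pandas as pd

BASE = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis"
train = pd.read_csv(os.path.join(BASE, "global_error_buffer_anli_round14_train.csv"))
val = pd.read_csv(os.path.join(BASE, "global_error_buffer_anli_round14_val.csv"))

# Stratification should make the per-class shares nearly identical across splits.
print(train["true_label_id"].value_counts(normalize=True).sort_index())
print(val["true_label_id"].value_counts(normalize=True).sort_index())
print(f"val ratio: {len(val) / (len(train) + len(val)):.2f}")  # ~0.20
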
evaluate_model_hf_only (2).py ADDED
@@ -0,0 +1,403 @@
+ import os
+ import json
+ from datetime import datetime
+ from pathlib import Path
+
+ import torch
+ import numpy as np
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
+ from tqdm.auto import tqdm
+
+ # ============================
+ # CONFIG
+ # ============================
+
+ MODEL_PATH = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\models\student_biomed_kd_fast\adni_srl_round14_smart"
+ OUTPUT_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\evaluation_results\adni_srl_round2_fixed"
+
+ BATCH_SIZE = 64
+ MAX_LENGTH = 192
+
+ # HuggingFace datasets: (display name, HF dataset id, split, config)
+ DATASETS_CONFIG = [
+     ("SNLI", "snli", "test", None),
+     ("MNLI M", "nyu-mll/multi_nli", "validation_matched", None),
+     ("MNLI MM", "nyu-mll/multi_nli", "validation_mismatched", None),
+     ("ANLI R1", "facebook/anli", "test_r1", None),
+     ("ANLI R2", "facebook/anli", "test_r2", None),
+     ("ANLI R3", "facebook/anli", "test_r3", None),
+     ("XNLI", "facebook/xnli", "validation", "en"),
+ ]
+
+ # Local ADNI NLI JSON files
+ ADNI_DATASETS = [
+     ("ADNI Train", r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\splits\adni_nli_train.json"),
+     ("ADNI Val", r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\splits\adni_nli_val.json"),
+     ("ADNI Test", r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\splits\adni_nli_test.json"),
+ ]
+
+ LABEL_NAMES = ["entailment", "neutral", "contradiction"]
+
+
+ # ============================
+ # HELPER FUNCTIONS
+ # ============================
+
+ def load_model_and_tokenizer(model_path: str, device: str):
+     print(f"\n{'='*60}")
+     print("Loading Model and Tokenizer")
+     print(f"{'='*60}")
+     print(f"Model: {model_path}")
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     model = AutoModelForSequenceClassification.from_pretrained(model_path)
+     model.to(device)
+     model.eval()
+
+     print(f"Device: {device}")
+     print("Model loaded successfully!")
+
+     return tokenizer, model
+
+
+ def compute_metrics_from_predictions(name, labels, preds):
+     accuracy = accuracy_score(labels, preds)
+     precision, recall, f1, support = precision_recall_fscore_support(
+         labels, preds, average=None, labels=[0, 1, 2], zero_division=0
+     )
+     macro_precision = float(np.mean(precision))
+     macro_recall = float(np.mean(recall))
+     macro_f1 = float(np.mean(f1))
+     conf_matrix = confusion_matrix(labels, preds, labels=[0, 1, 2])
+
+     print(f"\n{'='*60}")
+     print(f"RESULTS: {name}")
+     print(f"{'='*60}")
+     print(f"Samples: {len(labels)}")
+     print(f"Accuracy: {accuracy*100:.2f}%")
+     print(f"Macro F1: {macro_f1*100:.2f}%")
+     print("\nPer-Class Performance:")
+     for i, label_name in enumerate(LABEL_NAMES):
+         print(
+             f" {label_name.upper():13} "
+             f"P: {precision[i]*100:.2f}% "
+             f"R: {recall[i]*100:.2f}% "
+             f"F1: {f1[i]*100:.2f}% (n={support[i]})"
+         )
+
+     result = {
+         "dataset": name,
+         "accuracy": float(accuracy),
+         "macro_precision": macro_precision,
+         "macro_recall": macro_recall,
+         "macro_f1": macro_f1,
+         "per_class": {
+             LABEL_NAMES[i]: {
+                 "precision": float(precision[i]),
+                 "recall": float(recall[i]),
+                 "f1": float(f1[i]),
+                 "support": int(support[i]),
+             }
+             for i in range(3)
+         },
+         "confusion_matrix": conf_matrix.tolist(),
+         "total_samples": len(labels),
+     }
+     return result
+
+
+ def evaluate_dataset(
+     name: str,
+     hf_name: str,
+     split: str,
+     config: str,
+     tokenizer,
+     model,
+     device: str,
+     batch_size: int,
+     max_length: int,
+ ):
+     print(f"\n{'='*60}")
+     print(f"Loading {name} Dataset")
+     print(f"{'='*60}")
+
+     if config:
+         dataset = load_dataset(hf_name, config, split=split, trust_remote_code=False)
+     else:
+         dataset = load_dataset(hf_name, split=split, trust_remote_code=False)
+
+     # Drop unlabeled examples (label == -1), as in SNLI/MNLI test residue
+     if "label" in dataset.column_names:
+         dataset = dataset.filter(lambda ex: ex["label"] != -1)
+
+     print(f"✅ Loaded {len(dataset)} valid examples")
+
+     premises = [str(ex["premise"]) for ex in dataset]
+     hypotheses = [str(ex["hypothesis"]) for ex in dataset]
+     labels = [int(ex["label"]) for ex in dataset]
+
+     label_counts = {0: 0, 1: 0, 2: 0}
+     for lab in labels:
+         label_counts[lab] = label_counts.get(lab, 0) + 1
+     print(f"Label distribution: {label_counts}")
+
+     print(f"\n{'='*60}")
+     print(f"Evaluating: {name}")
+     print(f"{'='*60}")
+
+     all_preds = []
+     num_batches = (len(labels) + batch_size - 1) // batch_size
+
+     with torch.no_grad():
+         for i in tqdm(range(0, len(labels), batch_size), total=num_batches, desc=name):
+             batch_premises = premises[i:i + batch_size]
+             batch_hypotheses = hypotheses[i:i + batch_size]
+
+             encodings = tokenizer(
+                 batch_premises,
+                 batch_hypotheses,
+                 padding=True,
+                 truncation=True,
+                 max_length=max_length,
+                 return_tensors="pt",
+             ).to(device)
+
+             outputs = model(**encodings)
+             preds = torch.argmax(outputs.logits, dim=-1).cpu().tolist()
+             all_preds.extend(preds)
+
+     return compute_metrics_from_predictions(name, labels, all_preds)
+
+
+ def extract_label(rec):
+     """
+     Robustly extract a label as int 0/1/2 from a JSON record.
+     Handles:
+       - rec['label'] as int or string
+       - rec['true_label_id'] as int
+       - rec['gold_label'] as string
+     """
+     mapping = {
+         "entailment": 0,
+         "e": 0,
+         "neutral": 1,
+         "n": 1,
+         "contradiction": 2,
+         "c": 2,
+     }
+
+     if "label" in rec:
+         v = rec["label"]
+         if isinstance(v, int):
+             return v
+         v_str = str(v).strip().lower()
+         if v_str in mapping:
+             return mapping[v_str]
+         raise ValueError(f"Unknown string label in 'label': {v}")
+
+     if "true_label_id" in rec:
+         return int(rec["true_label_id"])
+
+     if "gold_label" in rec:
+         v_str = str(rec["gold_label"]).strip().lower()
+         if v_str in mapping:
+             return mapping[v_str]
+         raise ValueError(f"Unknown string label in 'gold_label': {rec['gold_label']}")
+
+     raise ValueError(f"Could not extract label from record keys: {list(rec.keys())}")
+
+
+ def evaluate_local_json_dataset(
+     name: str,
+     json_path: str,
+     tokenizer,
+     model,
+     device: str,
+     batch_size: int,
+     max_length: int,
+ ):
+     print(f"\n{'='*60}")
+     print(f"Loading {name} (local JSON)")
+     print(f"{'='*60}")
+     print(f"Path: {json_path}")
+
+     if not os.path.exists(json_path):
+         raise FileNotFoundError(f"JSON file not found: {json_path}")
+
+     with open(json_path, "r", encoding="utf-8") as f:
+         data = json.load(f)
+
+     # Accept either a bare list of records or a {"data": [...]} wrapper
+     if isinstance(data, dict) and "data" in data:
+         records = data["data"]
+     else:
+         records = data
+
+     premises = []
+     hypotheses = []
+     labels = []
+
+     for rec in records:
+         premise = rec.get("premise")
+         hypothesis = rec.get("hypothesis")
+         if premise is None or hypothesis is None:
+             raise ValueError("Expected 'premise' and 'hypothesis' keys in ADNI JSON records.")
+         label = extract_label(rec)
+
+         if label == -1:
+             continue
+
+         premises.append(str(premise))
+         hypotheses.append(str(hypothesis))
+         labels.append(int(label))
+
+     print(f"✅ Loaded {len(labels)} valid examples")
+
+     label_counts = {0: 0, 1: 0, 2: 0}
+     for lab in labels:
+         label_counts[lab] = label_counts.get(lab, 0) + 1
+     print(f"Label distribution: {label_counts}")
+
+     print(f"\n{'='*60}")
+     print(f"Evaluating: {name}")
+     print(f"{'='*60}")
+
+     all_preds = []
+     num_batches = (len(labels) + batch_size - 1) // batch_size
+
+     with torch.no_grad():
+         for i in tqdm(range(0, len(labels), batch_size), total=num_batches, desc=name):
+             batch_premises = premises[i:i + batch_size]
+             batch_hypotheses = hypotheses[i:i + batch_size]
+
+             encodings = tokenizer(
+                 batch_premises,
+                 batch_hypotheses,
+                 padding=True,
+                 truncation=True,
+                 max_length=max_length,
+                 return_tensors="pt",
+             ).to(device)
+
+             outputs = model(**encodings)
+             preds = torch.argmax(outputs.logits, dim=-1).cpu().tolist()
+             all_preds.extend(preds)
+
+     return compute_metrics_from_predictions(name, labels, all_preds)
+
+
+ def save_results(results: list, output_dir: str, model_path: str):
+     os.makedirs(output_dir, exist_ok=True)
+
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     model_name = Path(model_path).name
+
+     json_path = os.path.join(output_dir, f"results_{model_name}_{timestamp}.json")
+     with open(json_path, "w", encoding="utf-8") as f:
+         json.dump(results, f, indent=2)
+
+     summary_path = os.path.join(output_dir, f"summary_{model_name}_{timestamp}.txt")
+     with open(summary_path, "w", encoding="utf-8") as f:
+         f.write("="*80 + "\n")
+         f.write("COMPREHENSIVE NLI MODEL EVALUATION SUMMARY\n")
+         f.write("="*80 + "\n")
+         f.write(f"Model: {model_path}\n")
+         f.write(f"Timestamp: {timestamp}\n")
+         f.write("="*80 + "\n\n")
+
+         for result in results:
+             f.write(f"{result['dataset']}\n")
+             f.write("-" * 40 + "\n")
+             f.write(f"Accuracy: {result['accuracy']*100:.2f}%\n")
+             f.write(f"Macro F1: {result['macro_f1']*100:.2f}%\n")
+             f.write(f"Samples: {result['total_samples']}\n")
+             f.write("\n")
+
+         f.write("\n" + "="*80 + "\n")
+         f.write("OVERALL STATISTICS\n")
+         f.write("="*80 + "\n")
+
+         avg_accuracy = np.mean([r['accuracy'] for r in results])
+         avg_f1 = np.mean([r['macro_f1'] for r in results])
+
+         f.write(f"Average Accuracy: {avg_accuracy*100:.2f}%\n")
+         f.write(f"Average Macro F1: {avg_f1*100:.2f}%\n")
+
+     print("\n✅ Results saved:")
+     print(f" JSON: {json_path}")
+     print(f" Summary: {summary_path}")
+
+     return json_path, summary_path
+
+
+ # ============================
+ # MAIN
+ # ============================
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     print("="*80)
+     print("COMPREHENSIVE NLI MODEL EVALUATION")
+     print("="*80)
+     print(f"Model: {MODEL_PATH}")
+     all_names = [d[0] for d in DATASETS_CONFIG] + [d[0] for d in ADNI_DATASETS]
+     print(f"Datasets: {', '.join(all_names)}")
+     print("="*80)
+
+     tokenizer, model = load_model_and_tokenizer(MODEL_PATH, device)
+
+     all_results = []
+
+     for name, hf_name, split, config in DATASETS_CONFIG:
+         result = evaluate_dataset(
+             name=name,
+             hf_name=hf_name,
+             split=split,
+             config=config,
+             tokenizer=tokenizer,
+             model=model,
+             device=device,
+             batch_size=BATCH_SIZE,
+             max_length=MAX_LENGTH,
+         )
+         all_results.append(result)
+
+     for name, path in ADNI_DATASETS:
+         result = evaluate_local_json_dataset(
+             name=name,
+             json_path=path,
+             tokenizer=tokenizer,
+             model=model,
+             device=device,
+             batch_size=BATCH_SIZE,
+             max_length=MAX_LENGTH,
+         )
+         all_results.append(result)
+
+     save_results(all_results, OUTPUT_DIR, MODEL_PATH)
+
+     print(f"\n{'='*80}")
+     print("EVALUATION COMPLETE - FINAL SUMMARY")
+     print(f"{'='*80}\n")
+
+     print(f"{'Dataset':<15} {'Accuracy':<12} {'Macro F1':<12} {'Samples':<10}")
+     print("-" * 50)
+
+     for result in all_results:
+         print(
+             f"{result['dataset']:<15} "
+             f"{result['accuracy']*100:>6.2f}% "
+             f"{result['macro_f1']*100:>6.2f}% "
+             f"{result['total_samples']:>6}"
+         )
+
+     print("-" * 50)
+     avg_accuracy = np.mean([r['accuracy'] for r in all_results])
+     avg_f1 = np.mean([r['macro_f1'] for r in all_results])
+     print(f"{'AVERAGE':<15} {avg_accuracy*100:>6.2f}% {avg_f1*100:>6.2f}%")
+     print("="*80)
+
+
+ if __name__ == "__main__":
+     main()
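
The per-run JSON that save_results writes can be reloaded later for comparison across rounds (a minimal sketch; the glob pattern matches the results_{model_name}_{timestamp}.json naming above and assumes at least one completed run):

import glob
import json
import os

OUTPUT_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\evaluation_results\adni_srl_round2_fixed"

# Pick the most recently written results file.
latest = max(glob.glob(os.path.join(OUTPUT_DIR, "results_*.json")), key=os.path.getmtime)
with open(latest, encoding="utf-8") as f:
    results = json.load(f)

for r in results:
    print(f"{r['dataset']:<15} acc={r['accuracy']*100:.2f}%  macro_f1={r['macro_f1']*100:.2f}%")
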
srl_finetune_round5_smart.py ADDED
@@ -0,0 +1,304 @@
+ """
+ SRL Round 5 - Smart ANLI Fine-tune (Small, Safe Correction)
+
+ - Base model: best global checkpoint (adni_srl_round3_final)
+ - Data: smart ANLI SRL buffer (60% errors / 40% correct, pattern-tagged)
+ - Goal: improve ANLI robustness on real failure patterns without hurting SNLI/MNLI/XNLI
+ """
+
+ import os
+ os.environ["WANDB_DISABLED"] = "true"
+
+ from dataclasses import dataclass
+ from typing import Dict, List, Union
+
+ import pandas as pd
+ import numpy as np
+ import torch
+ from torch import nn
+
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     Trainer,
+     TrainingArguments,
+ )
+
+ # =============================================================================
+ # CONFIG
+ # =============================================================================
+
+ # Best model so far (global NLI + ADNI)
+ BASE_MODEL_PATH = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\models\student_biomed_kd_fast\adni_srl_round13_smart"
+
+ # Smart SRL buffers (ANLI Round 1 error patterns)
+ SMART_TRAIN_CSV = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis\global_error_buffer_anli_round14_train.csv"
+ SMART_VAL_CSV = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\AetherMind_for_Alzheimers_Research\data\claims\analysis\global_error_buffer_anli_round14_val.csv"
+
+ # Output directory for the new model
+ OUTPUT_DIR = r"C:\Users\Sam\OneDrive\AetherMind\AetherMindProject\models\student_biomed_kd_fast\adni_srl_round14_smart"
+
+ # Max sequence length
+ MAX_LENGTH = 192
+
+ # Training hyper-parameters (small, safe SRL step)
+ NUM_EPOCHS = 1
+ BATCH_SIZE = 16
+ LEARNING_RATE = 2e-6
+
+ # Class weights (E, N, C) – mild bias towards entailment and contradiction
+ CLASS_WEIGHTS = torch.tensor([1.5, 1.0, 1.3], dtype=torch.float32)
+
+ # Error vs correct weighting
+ ERROR_WEIGHT = 2.0  # errors * 2.0, correct * 1.0
+
+ # Seed
+ SEED = 42
+
+
+ # =============================================================================
+ # DATASET
+ # =============================================================================
+
+ class NLIDataset(torch.utils.data.Dataset):
+     def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_length: int = 128):
+         self.tokenizer = tokenizer
+         self.max_length = max_length
+
+         # Expect columns: premise, hypothesis, true_label_id, is_error
+         premises = df["premise"].astype(str).tolist()
+         hypotheses = df["hypothesis"].astype(str).tolist()
+         labels = df["true_label_id"].astype(int).tolist()
+
+         is_error = df["is_error"].astype(int).tolist()
+         error_weights = [ERROR_WEIGHT if e == 1 else 1.0 for e in is_error]
+
+         encodings = tokenizer(
+             premises,
+             hypotheses,
+             truncation=True,
+             padding="max_length",
+             max_length=max_length,
+         )
+
+         self.input_ids = torch.tensor(encodings["input_ids"], dtype=torch.long)
+         self.attention_mask = torch.tensor(encodings["attention_mask"], dtype=torch.long)
+         if "token_type_ids" in encodings:
+             self.token_type_ids = torch.tensor(encodings["token_type_ids"], dtype=torch.long)
+         else:
+             self.token_type_ids = None
+
+         self.labels = torch.tensor(labels, dtype=torch.long)
+         self.error_weights = torch.tensor(error_weights, dtype=torch.float32)
+
+     def __len__(self):
+         return self.labels.size(0)
+
+     def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+         item = {
+             "input_ids": self.input_ids[idx],
+             "attention_mask": self.attention_mask[idx],
+             "labels": self.labels[idx],
+             "error_weight": self.error_weights[idx],
+         }
+         if self.token_type_ids is not None:
+             item["token_type_ids"] = self.token_type_ids[idx]
+         return item
+
+
+ @dataclass
+ class DataCollatorWithWeights:
+     """
+     Simple collator: all sequences are already padded to max_length.
+     Just stacks tensors and keeps error_weight.
+     """
+     def __call__(self, features: List[Dict[str, Union[torch.Tensor, int, float]]]) -> Dict[str, torch.Tensor]:
+         batch: Dict[str, torch.Tensor] = {}
+         keys = features[0].keys()
+         for key in keys:
+             batch[key] = torch.stack([f[key] for f in features])
+         return batch
+
+
+ # =============================================================================
+ # TRAINER WITH CLASS + ERROR WEIGHTED LOSS
+ # =============================================================================
+
+ class ClassAndErrorWeightedTrainer(Trainer):
+     def __init__(self, *args, class_weights: torch.Tensor = None, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.class_weights = class_weights
+
+     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+         labels = inputs.pop("labels")
+         error_weight = inputs.pop("error_weight", None)
+
+         outputs = model(**inputs)
+         logits = outputs.logits
+
+         # Move class weights to the correct device
+         cw = self.class_weights.to(logits.device) if self.class_weights is not None else None
+
+         loss_fct = nn.CrossEntropyLoss(weight=cw, reduction="none")
+         per_sample_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         if error_weight is not None:
+             ew = error_weight.to(per_sample_loss.device).view(-1)
+             per_sample_loss = per_sample_loss * ew
+
+         loss = per_sample_loss.mean()
+
+         if return_outputs:
+             return loss, outputs
+         return loss
+
+
+ # =============================================================================
+ # METRICS
+ # =============================================================================
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     preds = np.argmax(logits, axis=-1)
+
+     labels = labels.astype(int)
+     preds = preds.astype(int)
+
+     acc = (preds == labels).mean()
+
+     # Per-class metrics
+     num_classes = 3
+     f1s = []
+     recalls = []
+     for cls in range(num_classes):
+         tp = np.logical_and(preds == cls, labels == cls).sum()
+         fp = np.logical_and(preds == cls, labels != cls).sum()
+         fn = np.logical_and(preds != cls, labels == cls).sum()
+
+         prec = tp / (tp + fp + 1e-8)
+         rec = tp / (tp + fn + 1e-8)
+         f1 = 2 * prec * rec / (prec + rec + 1e-8)
+
+         f1s.append(f1)
+         recalls.append(rec)
+
+     macro_f1 = float(np.mean(f1s))
+
+     return {
+         "accuracy": float(acc),
+         "macro_f1": macro_f1,
+         "entailment_recall": float(recalls[0]),
+         "neutral_recall": float(recalls[1]),
+         "contradiction_recall": float(recalls[2]),
+     }
+
+
+ # =============================================================================
+ # MAIN
+ # =============================================================================
+
+ def main():
+     torch.manual_seed(SEED)
+     np.random.seed(SEED)
+
+     print("=" * 80)
+     print("SRL ROUND 5 - SMART ANLI FINE-TUNE")
+     print("=" * 80)
+     print(f"Base model : {BASE_MODEL_PATH}")
+     print(f"Train CSV (SRL) : {SMART_TRAIN_CSV}")
+     print(f"Val CSV (SRL) : {SMART_VAL_CSV}")
+     print(f"Output dir : {OUTPUT_DIR}")
+     print("=" * 80)
+
+     # ---------------------------------------------------------
+     # Load tokenizer + model
+     # ---------------------------------------------------------
+     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
+     model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_PATH)
+
+     # ---------------------------------------------------------
+     # Load SRL buffers
+     # ---------------------------------------------------------
+     train_df = pd.read_csv(SMART_TRAIN_CSV)
+     val_df = pd.read_csv(SMART_VAL_CSV)
+
+     print("\nSMART SRL TRAIN BUFFER")
+     print("----------------------")
+     print(f"Rows: {len(train_df)}")
+     print(train_df["true_label_id"].value_counts(normalize=True).sort_index())
+
+     print("\nSMART SRL VAL BUFFER")
+     print("--------------------")
+     print(f"Rows: {len(val_df)}")
+     print(val_df["true_label_id"].value_counts(normalize=True).sort_index())
+
+     # ---------------------------------------------------------
+     # Build datasets
+     # ---------------------------------------------------------
+     train_dataset = NLIDataset(train_df, tokenizer, max_length=MAX_LENGTH)
+     val_dataset = NLIDataset(val_df, tokenizer, max_length=MAX_LENGTH)
+
+     # ---------------------------------------------------------
+     # Training args (SMALL, SAFE)
+     # ---------------------------------------------------------
+     training_args = TrainingArguments(
+         output_dir=OUTPUT_DIR,
+         overwrite_output_dir=True,
+         num_train_epochs=NUM_EPOCHS,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         learning_rate=LEARNING_RATE,
+         weight_decay=0.01,
+         logging_steps=50,
+         eval_strategy="epoch",  # same as your other SRL scripts
+         save_strategy="epoch",
+         save_total_limit=2,
+         load_best_model_at_end=True,
+         metric_for_best_model="macro_f1",
+         remove_unused_columns=False,
+         report_to=[],
+     )
+
+     data_collator = DataCollatorWithWeights()
+
+     trainer = ClassAndErrorWeightedTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=val_dataset,
+         data_collator=data_collator,
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics,
+         class_weights=CLASS_WEIGHTS,
+     )
+
+     # ---------------------------------------------------------
+     # Train
+     # ---------------------------------------------------------
+     print("\nStarting SRL Round 5 (smart ANLI fine-tune)...")
+     trainer.train()
+
+     print("\nFinal evaluation on SRL val buffer:")
+     metrics = trainer.evaluate()
+     for k, v in metrics.items():
+         print(f" {k}: {v:.4f}" if isinstance(v, float) else f" {k}: {v}")
+
+     # ---------------------------------------------------------
+     # Save
+     # ---------------------------------------------------------
+     print("\nSaving final SRL Round 5 model...")
+     trainer.save_model(OUTPUT_DIR)
+     tokenizer.save_pretrained(OUTPUT_DIR)
+
+     print("\n" + "=" * 80)
+     print("✅ SRL ROUND 5 SMART FINE-TUNE COMPLETE")
+     print("=" * 80)
+     print(f"Model saved to: {OUTPUT_DIR}")
+     print("Next: run evaluate_model_hf_only.py with this path as MODEL_PATH.")
+     print("=" * 80)
+
+
+ if __name__ == "__main__":
+     main()
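
The combined weighting in ClassAndErrorWeightedTrainer.compute_loss is easiest to see on a toy batch (a minimal sketch with made-up logits and labels; the class weights mirror CLASS_WEIGHTS above):

import torch
from torch import nn

logits = torch.tensor([[2.0, 0.5, -1.0],
                       [0.1, 0.2, 0.3]])       # [B=2, 3] fake model outputs
labels = torch.tensor([0, 2])                  # true classes: E, C
error_weight = torch.tensor([1.0, 2.0])        # second example was a past error

class_weights = torch.tensor([1.5, 1.0, 1.3])  # same values as CLASS_WEIGHTS
loss_fct = nn.CrossEntropyLoss(weight=class_weights, reduction="none")

per_sample = loss_fct(logits, labels)          # class-weighted CE per example
weighted = per_sample * error_weight           # past errors count double
print(weighted.mean())                         # the scalar the trainer backprops

Note that the trainer takes a plain .mean() of the weighted per-sample losses rather than normalizing by the sum of weights, so the effective learning rate scales mildly with the error fraction of each batch.
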