""" Apply confident learning findings to clean training data. Strategy: 1. HIGH confidence mislabels (>0.85): relabel to model's prediction EXCEPT negation cases ("I'm NOT suicidal") — these get removed 2. MEDIUM confidence (0.7-0.85): remove (too ambiguous to relabel) 3. LOW confidence (<0.7): keep as-is (model isn't sure enough) Also handles: - Negation detection: sentences that discuss symptoms in the negative - Removes samples that are too ambiguous for any label Usage: python apply_confident_learning.py """ import json import logging import re from pathlib import Path import pandas as pd logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Patterns that indicate negation — discussing a symptom but NOT experiencing it NEGATION_PATTERNS = [ r"\b(not|never|no longer|don'?t|doesn'?t|isn'?t|wasn'?t|aren'?t|haven'?t|hasn'?t|won'?t|can'?t)\b.{0,20}\b(suicid|kill|die|death|depress|sad|cry|sleep|tired|energy|appetite|focus|concentrat|worthless|guilt|burden)", r"\b(suicid|kill|die|death|depress|sad|cry|sleep|tired|energy|appetite|focus|concentrat|worthless|guilt|burden).{0,20}\b(not|never|no longer|don'?t|doesn'?t|isn'?t|wasn'?t|aren'?t|haven'?t|hasn'?t|won'?t|can'?t)\b", r"\b(before|used to|in the past|years ago|stopped|quit|no more)\b.{0,30}\b(suicid|depress|cry|sad)", r"\bI'?m not\b", r"\bI never\b", r"\bI don'?t (have|feel|think|want)\b", ] def is_negation(text: str) -> bool: """Check if text discusses a symptom in the negative/past tense.""" text_lower = text.lower() return any(re.search(pattern, text_lower) for pattern in NEGATION_PATTERNS) def main(): base_dir = Path(__file__).parent.parent cleaned_dir = base_dir / "data" / "redsm5" / "cleaned" output_dir = base_dir / "data" / "redsm5" / "cleaned_v2" output_dir.mkdir(parents=True, exist_ok=True) # Load current cleaned data train = pd.read_csv(cleaned_dir / "train.csv") val = pd.read_csv(cleaned_dir / "val.csv") test = pd.read_csv(cleaned_dir / "test.csv") with open(cleaned_dir / "metadata.json") as f: metadata = json.load(f) label_map = metadata["label_map"] # Load confident learning suspects with open(cleaned_dir / "confident_learning_suspects.json") as f: suspects = json.load(f) logger.info(f"Original cleaned training samples: {len(train)}") logger.info(f"Total suspects: {len(suspects)}") # Categorize suspects relabeled = 0 removed = 0 kept = 0 indices_to_drop = [] relabel_map = {} # index → new label for s in suspects: idx = int(s["index"]) conf = float(s["pred_conf"]) true_label = s["true_label"] pred_label = s["pred_label"] text = train.iloc[idx]["clean_text"] if idx < len(train) else "" if conf > 0.85: # High confidence — model is very sure this is mislabeled if is_negation(text): # Negation case: "I'm NOT suicidal" — remove, it confuses the model indices_to_drop.append(idx) removed += 1 else: # Genuine mislabel — relabel to model's prediction relabel_map[idx] = pred_label relabeled += 1 elif conf > 0.7: # Medium confidence — too ambiguous, remove indices_to_drop.append(idx) removed += 1 else: # Low confidence — keep original label kept += 1 logger.info("\nActions:") logger.info(f" Relabeled (conf >0.85, not negation): {relabeled}") logger.info(f" Removed (negation or ambiguous 0.7-0.85): {removed}") logger.info(f" Kept as-is (conf <0.7): {kept}") # Apply relabeling for idx, new_label in relabel_map.items(): if idx < len(train): train.at[idx, "label"] = new_label train.at[idx, "label_id"] = label_map[new_label] # Apply removal train = train.drop(index=[i for i in indices_to_drop if i < len(train)]) train = train.reset_index(drop=True) logger.info(f"\nAfter confident learning: {len(train)} training samples") # Recompute class weights from preprocess_redsm5 import SYMPTOM_LABELS, SYMPTOM_READABLE counts = train["label_id"].value_counts().sort_index() total = len(train) n_classes = len(SYMPTOM_LABELS) class_weights = {} for label_id, count in counts.items(): class_weights[int(label_id)] = total / (n_classes * count) # Save train.to_csv(output_dir / "train.csv", index=False) val.to_csv(output_dir / "val.csv", index=False) test.to_csv(output_dir / "test.csv", index=False) new_metadata = { "label_map": label_map, "label_readable": SYMPTOM_READABLE, "class_weights": class_weights, "num_classes": n_classes, "total_samples": len(train) + len(val) + len(test), "train_samples": len(train), "val_samples": len(val), "test_samples": len(test), "confident_learning": { "relabeled": relabeled, "removed": removed, "kept": kept, "threshold_high": 0.85, "threshold_medium": 0.7, }, "label_distribution": { "train": train["label"].value_counts().to_dict(), }, } with open(output_dir / "metadata.json", "w") as f: json.dump(new_metadata, f, indent=2) # Report print(f"\n{'=' * 60}") print("CONFIDENT LEARNING APPLIED") print(f"{'=' * 60}") print("Original cleaned: 1514") print(f"After CL: {len(train)}") print(f" Relabeled: {relabeled}") print(f" Removed: {removed}") print(f" Kept: {kept}") print("\nNew class distribution:") for label, count in train["label"].value_counts().sort_values().items(): print(f" {label:<22} {count:>4}") print(f"\nSaved to: {output_dir}") if __name__ == "__main__": main()