# depscreen/ml/scripts/apply_confident_learning.py
# Last commit (halsabbah, 95974bc): "style: apply ruff format to pass CI format check"
"""
Apply confident learning findings to clean training data.

Strategy:
1. HIGH confidence mislabels (> 0.85): relabel to the model's prediction,
   EXCEPT negation cases ("I'm NOT suicidal") — these are removed instead.
2. MEDIUM confidence (above 0.7, up to 0.85): remove (too ambiguous to relabel).
3. LOW confidence (0.7 or below): keep as-is (the model isn't sure enough).

Also handles:
- Negation detection: sentences that discuss symptoms in the negative or past tense.
- Removal of samples that are too ambiguous for any label.

Usage:
    python apply_confident_learning.py
"""
import json
import logging
import re
from pathlib import Path
import pandas as pd
# Configure the root logger so INFO-level (and above) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Patterns that indicate negation — discussing a symptom but NOT experiencing it.
# Kept as plain pattern strings so anything importing NEGATION_PATTERNS keeps working.
NEGATION_PATTERNS = [
    # Negation word followed, within 20 characters, by a symptom stem.
    r"\b(not|never|no longer|don'?t|doesn'?t|isn'?t|wasn'?t|aren'?t|haven'?t|hasn'?t|won'?t|can'?t)\b.{0,20}\b(suicid|kill|die|death|depress|sad|cry|sleep|tired|energy|appetite|focus|concentrat|worthless|guilt|burden)",
    # Symptom stem followed, within 20 characters, by a negation word.
    r"\b(suicid|kill|die|death|depress|sad|cry|sleep|tired|energy|appetite|focus|concentrat|worthless|guilt|burden).{0,20}\b(not|never|no longer|don'?t|doesn'?t|isn'?t|wasn'?t|aren'?t|haven'?t|hasn'?t|won'?t|can'?t)\b",
    # Past-tense / "no longer the case" marker followed by a symptom stem.
    r"\b(before|used to|in the past|years ago|stopped|quit|no more)\b.{0,30}\b(suicid|depress|cry|sad)",
    # Generic first-person negations.
    r"\bI'?m not\b",
    r"\bI never\b",
    r"\bI don'?t (have|feel|think|want)\b",
]

# Compiled once at import time. IGNORECASE is required: the first-person patterns
# contain an uppercase "I", so matching them against lowercased text (as was done
# previously) could never succeed.
_NEGATION_REGEXES = [re.compile(pattern, re.IGNORECASE) for pattern in NEGATION_PATTERNS]


def is_negation(text: str) -> bool:
    """Check if text discusses a symptom in the negative/past tense.

    Args:
        text: The sentence to inspect. Non-string values (e.g. the NaN that
            pandas produces for an empty CSV cell) are treated as "no negation".

    Returns:
        True if any pattern in NEGATION_PATTERNS matches ``text``
        case-insensitively, otherwise False.
    """
    if not isinstance(text, str):
        return False
    return any(regex.search(text) for regex in _NEGATION_REGEXES)
# Confidence cut-offs used to decide what to do with each suspect.
_HIGH_CONF_THRESHOLD = 0.85  # strictly above: relabel (or remove if negation)
_MEDIUM_CONF_THRESHOLD = 0.7  # strictly above (and not high): remove as ambiguous


def main():
    """Apply confident-learning suspects to the cleaned training split.

    Reads ``train.csv``, ``val.csv``, ``test.csv``, ``metadata.json`` and
    ``confident_learning_suspects.json`` from ``data/redsm5/cleaned`` (relative
    to the parent of this script's directory), then for each suspect:

    * confidence > 0.85: relabel to the predicted label, unless the text is a
      negation (see ``is_negation``), in which case the row is removed;
    * confidence > 0.7: remove the row;
    * otherwise: leave the row untouched.

    Suspects whose index does not address a row of the training frame are
    skipped with a warning and are not counted in any category.

    Writes the updated train split, the unchanged val/test splits and a new
    ``metadata.json`` (with recomputed class weights) to
    ``data/redsm5/cleaned_v2``, and prints a summary report.
    """
    base_dir = Path(__file__).parent.parent
    cleaned_dir = base_dir / "data" / "redsm5" / "cleaned"
    output_dir = base_dir / "data" / "redsm5" / "cleaned_v2"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load current cleaned data
    train = pd.read_csv(cleaned_dir / "train.csv")
    val = pd.read_csv(cleaned_dir / "val.csv")
    test = pd.read_csv(cleaned_dir / "test.csv")

    with open(cleaned_dir / "metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)
    label_map = metadata["label_map"]

    # Load confident learning suspects
    with open(cleaned_dir / "confident_learning_suspects.json", encoding="utf-8") as f:
        suspects = json.load(f)

    # Remember the starting size: the final report must show the real number,
    # not a value hard-coded from one particular run.
    original_count = len(train)
    logger.info("Original cleaned training samples: %d", original_count)
    logger.info("Total suspects: %d", len(suspects))

    # Categorize suspects
    relabeled = 0
    removed = 0
    kept = 0
    skipped = 0
    indices_to_drop = []
    relabel_map = {}  # index → new label

    for s in suspects:
        idx = int(s["index"])
        conf = float(s["pred_conf"])
        pred_label = s["pred_label"]

        # A negative index would silently address rows from the end via iloc and
        # then *append* a row via .at; an index past the end can't be applied at
        # all. Either way the suspect doesn't belong to this frame, so skip it
        # rather than counting an action that never happens.
        if not 0 <= idx < original_count:
            logger.warning("Skipping suspect with out-of-range index %d", idx)
            skipped += 1
            continue

        if conf > _HIGH_CONF_THRESHOLD:
            # High confidence — model is very sure this is mislabeled
            text = train["clean_text"].iloc[idx]
            if not isinstance(text, str):
                # Empty CSV cells are read as NaN; treat them as empty text.
                text = ""
            if is_negation(text):
                # Negation case: "I'm NOT suicidal" — remove, it confuses the model
                indices_to_drop.append(idx)
                removed += 1
            else:
                # Genuine mislabel — relabel to model's prediction
                relabel_map[idx] = pred_label
                relabeled += 1
        elif conf > _MEDIUM_CONF_THRESHOLD:
            # Medium confidence — too ambiguous, remove
            indices_to_drop.append(idx)
            removed += 1
        else:
            # Low confidence — keep original label
            kept += 1

    logger.info("\nActions:")
    logger.info("  Relabeled (conf >0.85, not negation): %d", relabeled)
    logger.info("  Removed (negation or ambiguous 0.7-0.85): %d", removed)
    logger.info("  Kept as-is (conf <0.7): %d", kept)
    if skipped:
        logger.info("  Skipped (index out of range): %d", skipped)

    # Apply relabeling (indices were validated above; the frame still has the
    # default 0..n-1 index from read_csv, so positions and labels coincide).
    for idx, new_label in relabel_map.items():
        train.at[idx, "label"] = new_label
        train.at[idx, "label_id"] = label_map[new_label]

    # Apply removal
    train = train.drop(index=indices_to_drop).reset_index(drop=True)
    logger.info("\nAfter confident learning: %d training samples", len(train))

    # Recompute class weights.
    # Project-local module; imported here, as before, so it is only needed at
    # this point of the run.
    from preprocess_redsm5 import SYMPTOM_LABELS, SYMPTOM_READABLE

    counts = train["label_id"].value_counts().sort_index()
    total = len(train)
    n_classes = len(SYMPTOM_LABELS)
    # weight = total / (n_classes * class_count); plain Python types for JSON.
    class_weights = {
        int(label_id): float(total / (n_classes * count)) for label_id, count in counts.items()
    }

    # Save
    train.to_csv(output_dir / "train.csv", index=False)
    val.to_csv(output_dir / "val.csv", index=False)
    test.to_csv(output_dir / "test.csv", index=False)

    new_metadata = {
        "label_map": label_map,
        "label_readable": SYMPTOM_READABLE,
        "class_weights": class_weights,
        "num_classes": n_classes,
        "total_samples": len(train) + len(val) + len(test),
        "train_samples": len(train),
        "val_samples": len(val),
        "test_samples": len(test),
        "confident_learning": {
            "relabeled": relabeled,
            "removed": removed,
            "kept": kept,
            "threshold_high": _HIGH_CONF_THRESHOLD,
            "threshold_medium": _MEDIUM_CONF_THRESHOLD,
        },
        "label_distribution": {
            "train": train["label"].value_counts().to_dict(),
        },
    }
    with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(new_metadata, f, indent=2)

    # Report
    print(f"\n{'=' * 60}")
    print("CONFIDENT LEARNING APPLIED")
    print(f"{'=' * 60}")
    print(f"Original cleaned: {original_count}")
    print(f"After CL: {len(train)}")
    print(f" Relabeled: {relabeled}")
    print(f" Removed: {removed}")
    print(f" Kept: {kept}")
    print("\nNew class distribution:")
    for label, count in train["label"].value_counts().sort_values().items():
        print(f" {label:<22} {count:>4}")
    print(f"\nSaved to: {output_dir}")
# Run the cleaning pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()