"""
slim_dataset.py — Reduce goEmotionDataset.csv to only the rows actually used
by emotion_analyzer.py, eliminating the Git LFS requirement.

emotion_analyzer.py caps centroid building at MAX_SAMPLES=300 per emotion (seed 42).
This script pre-applies that same sampling and takes the union of all selected rows,
so the CSV never needs more than ~8,400 rows instead of 57,732.

Run once from the repo root:
    python scripts/slim_dataset.py
"""
import os
import random
import pandas as pd

# Per-emotion cap on sampled rows and the RNG seed; per the module docstring
# these mirror the values emotion_analyzer.py uses when building centroids.
MAX_SAMPLES = 300
SEED = 42

# Label columns looked up in the CSV: 27 emotion categories plus "neutral".
# The sampling loop walks this list in order, so the order is significant
# for reproducing the same seeded draws.
EMOTION_LABELS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# The dataset sits one directory above the folder holding this script.
root = os.path.join(os.path.dirname(__file__), "..")
csv_path = os.path.join(os.path.dirname(__file__), "..", "goEmotionDataset.csv")

print(f"Reading {csv_path} ...")
df = pd.read_csv(csv_path)

# Remember the starting row count so the final report can show the reduction.
original_rows = df.shape[0]
print(f"Original: {original_rows} rows, {os.path.getsize(csv_path) / 1e6:.2f} MB")

# Index labels of every row to keep. A set, so a row drawn for several
# emotions is only recorded once.
keep_indices: set[int] = set()

# Seed the module-level RNG once, before the loop, so repeated runs over the
# same input draw the same samples.
random.seed(SEED)
for emotion in EMOTION_LABELS:
    if emotion in df.columns:
        # All rows flagged with this emotion.
        matches = df.index[df[emotion].eq(1)].tolist()
        # Emotions with more than MAX_SAMPLES rows are cut down to a random
        # subset of exactly MAX_SAMPLES; smaller ones are kept whole.
        indices = random.sample(matches, MAX_SAMPLES) if len(matches) > MAX_SAMPLES else matches
        keep_indices.update(indices)
        print(f"  {emotion}: kept {len(indices)} rows")
    else:
        print(f"  [warn] column '{emotion}' not found — skipping")

# Pull the selected rows in ascending index order and renumber them from 0.
slim = df.loc[sorted(keep_indices)].reset_index(drop=True)

# Write to a sibling temp file and swap it into place, so an interrupted
# write cannot leave the only copy of the dataset truncated.
tmp_path = csv_path + ".tmp"
slim.to_csv(tmp_path, index=False)
os.replace(tmp_path, csv_path)

new_size = os.path.getsize(csv_path)
# A header-only input has zero rows; avoid dividing by zero in the report.
pct_kept = len(slim) / original_rows * 100 if original_rows else 0.0
print(f"\nSlim dataset: {len(slim)} rows (was {original_rows}, "
      f"{pct_kept:.1f}% kept)")
print(f"File size: {new_size / 1e6:.2f} MB")

# 10 MB (decimal) is the threshold this script uses for "small enough to
# commit without Git LFS".
if new_size < 10 * 1e6:
    print("OK: Under 10 MB - safe to commit without Git LFS")
else:
    # Keeping fewer rows per emotion is what shrinks the file, so the advice
    # is to lower the cap, not raise it.
    print("WARN: Still over 10 MB - lower MAX_SAMPLES cutoff or drop metadata columns")