import pandas as pd

input_path = "data/processed/url_dataset_cleaned.csv"
output_path = "data/processed/url_dataset_balanced.csv"


def balance_by_label(df, label_col="label", random_state=42):
    """Return a shuffled copy of *df* with every class downsampled to the smallest class.

    Each distinct value in ``label_col`` is kept; classes larger than the
    smallest one are randomly downsampled to its size, classes already at
    that size are kept whole. Rows whose label is NaN are dropped, because
    ``value_counts`` ignores them.

    Args:
        df: Input frame containing ``label_col``.
        label_col: Name of the column holding the class labels.
        random_state: Seed used for both the downsampling and the final shuffle.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and equal rows per label.

    Raises:
        ValueError: If ``label_col`` contains no non-null values.
        KeyError: If ``label_col`` is not a column of *df*.
    """
    label_counts = df[label_col].value_counts()
    if label_counts.empty:
        raise ValueError(f"Column {label_col!r} contains no labelled rows to balance")

    minority_count = int(label_counts.min())

    # value_counts() is sorted largest-first; walk it in reverse so the
    # smallest class is concatenated first. Selecting per label (instead of
    # via idxmin/idxmax) keeps both classes when their counts are tied and
    # keeps every class when there are more than two.
    parts = []
    for label, count in label_counts.iloc[::-1].items():
        part = df[df[label_col] == label]
        if count > minority_count:
            part = part.sample(n=minority_count, random_state=random_state)
        parts.append(part)

    shuffled = pd.concat(parts).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


def main():
    """Load the cleaned dataset, balance it by label, and write the result to disk."""
    print("Loading dataset...")
    df = pd.read_csv(input_path)
    print(f"Total rows: {len(df):,}")

    label_counts = df["label"].value_counts()
    if label_counts.empty:
        raise ValueError(f"No labelled rows found in {input_path}")

    # Report whatever labels are actually present rather than assuming 0 and 1.
    print("  |  ".join(
        f"Label {label}: {count:,}"
        for label, count in label_counts.sort_index().items()
    ))

    minority_count = label_counts.min()
    minority_label = label_counts.idxmin()
    print(f"\nBalancing to {minority_count:,} per label (matching label {minority_label})...")

    df_balanced = balance_by_label(df)

    print("\nBalanced dataset:")
    for label, count in df_balanced["label"].value_counts().sort_index().items():
        print(f"  Label {label}: {count:,}")

    df_balanced.to_csv(output_path, index=False)
    print(f"\nSaved {len(df_balanced):,} rows to {output_path}")


if __name__ == "__main__":
    main()