"""Balance the cleaned URL dataset by downsampling the majority label.

Reads the CSV at ``input_path``, keeps every row of the least frequent
label, randomly downsamples the most frequent other label to the same
size, shuffles the result and writes it to ``output_path``.
"""

import pandas as pd

input_path = "data/processed/url_dataset_cleaned.csv"
output_path = "data/processed/url_dataset_balanced.csv"


def _select_labels(label_counts: pd.Series) -> tuple:
    """Return ``(minority_label, majority_label, minority_count)``.

    Args:
        label_counts: Result of ``Series.value_counts()`` on the label column.

    Raises:
        ValueError: If fewer than two distinct labels are present, because
            there is nothing to balance against.
    """
    if len(label_counts) < 2:
        raise ValueError(
            "Need at least two distinct labels to balance, "
            f"found {len(label_counts)}"
        )
    minority_label = label_counts.idxmin()
    # idxmin() and idxmax() both return the FIRST matching label, so on a tie
    # they would name the same label. Choosing the majority among the
    # remaining labels guarantees two different labels.
    majority_label = label_counts.drop(minority_label).idxmax()
    return minority_label, majority_label, int(label_counts.min())


def _format_counts(label_counts: pd.Series) -> str:
    """Render counts as ``Label a: n | Label b: m`` ordered by label."""
    return " | ".join(
        f"Label {label}: {count:,}"
        for label, count in label_counts.sort_index().items()
    )


def balance_labels(
    df: pd.DataFrame,
    label_col: str = "label",
    random_state: int = 42,
) -> pd.DataFrame:
    """Return a shuffled frame with equal rows for the two selected labels.

    All rows of the least frequent label are kept; the most frequent other
    label is randomly downsampled (without replacement) to the same count.
    Rows carrying any further label are not included in the result.

    Args:
        df: Input frame containing ``label_col``.
        label_col: Name of the column holding the class label.
        random_state: Seed used for both the downsampling and the shuffle.

    Returns:
        A new frame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``label_col`` has fewer than two distinct labels.
    """
    minority_label, majority_label, minority_count = _select_labels(
        df[label_col].value_counts()
    )
    df_minority = df[df[label_col] == minority_label]
    df_majority = df[df[label_col] == majority_label].sample(
        n=minority_count, random_state=random_state
    )
    # frac=1 draws every row once, i.e. a full shuffle of the combined frame.
    return (
        pd.concat([df_minority, df_majority])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


def main() -> None:
    """Load the cleaned dataset, balance it and save the result."""
    print("Loading dataset...")
    df = pd.read_csv(input_path)
    print(f"Total rows: {len(df):,}")

    label_counts = df["label"].value_counts()
    # Printed generically so labels other than 0/1 do not raise KeyError.
    print(_format_counts(label_counts))

    minority_label, _, minority_count = _select_labels(label_counts)
    print(
        f"\nBalancing to {minority_count:,} per label "
        f"(matching label {minority_label})..."
    )

    df_balanced = balance_labels(df)

    label_counts_new = df_balanced["label"].value_counts().sort_index()
    print("\nBalanced dataset:")
    for label, count in label_counts_new.items():
        print(f" Label {label}: {count:,}")

    df_balanced.to_csv(output_path, index=False)
    print(f"\nSaved {len(df_balanced):,} rows to {output_path}")


if __name__ == "__main__":
    main()