# Spaces: Runtime error
# File size: 1,115 Bytes (revision 2cc7f91)
import pandas as pd
# Location of the cleaned dataset to read and of the balanced dataset to write.
input_path = "data/processed/url_dataset_cleaned.csv"
output_path = "data/processed/url_dataset_balanced.csv"


def balance_by_label(df, label_col="label", random_state=42):
    """Return a shuffled copy of *df* with every class cut to the rarest class size.

    Each distinct value in ``label_col`` is treated as one class. Classes
    larger than the smallest one are randomly downsampled (without
    replacement); classes already at the target size are kept untouched.
    Rows whose label is missing are dropped, because ``value_counts`` ignores
    them and they therefore belong to no class.

    Args:
        df: Frame containing at least the column ``label_col``.
        label_col: Name of the column holding the class label.
        random_state: Seed used for both the per-class sampling and the
            final shuffle, so the result is reproducible.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. It is empty when *df*
        has no labelled rows.
    """
    counts = df[label_col].value_counts()
    if counts.empty:
        # Nothing to balance; hand back an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    target = int(counts.min())
    parts = []
    # Walk the classes from rarest to most common. Iterating over *every*
    # label (rather than picking idxmin/idxmax) keeps tied classes distinct
    # and does not silently discard a third or fourth class.
    for label in counts.sort_values(kind="stable").index:
        group = df[df[label_col] == label]
        if len(group) > target:
            group = group.sample(n=target, random_state=random_state)
        parts.append(group)

    shuffled = pd.concat(parts).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


def main():
    """Read ``input_path``, balance it on the ``label`` column, write ``output_path``.

    Raises:
        ValueError: If the input file contains no labelled rows.
    """
    print("Loading dataset...")
    df = pd.read_csv(input_path)
    print(f"Total rows: {len(df):,}")

    label_counts = df["label"].value_counts()
    if label_counts.empty:
        raise ValueError(f"No labelled rows found in {input_path}; nothing to balance")

    # Report whichever labels are actually present instead of indexing the
    # counts with hard-coded 0 / 1 keys, which fails when one is missing.
    print(" | ".join(
        f"Label {label}: {count:,}"
        for label, count in label_counts.sort_index().items()
    ))

    minority_count = label_counts.min()
    minority_label = label_counts.idxmin()
    print(f"\nBalancing to {minority_count:,} per label (matching label {minority_label})...")

    df_balanced = balance_by_label(df, label_col="label", random_state=42)

    label_counts_new = df_balanced["label"].value_counts().sort_index()
    print("\nBalanced dataset:")
    for label, count in label_counts_new.items():
        print(f" Label {label}: {count:,}")

    df_balanced.to_csv(output_path, index=False)
    print(f"\nSaved {len(df_balanced):,} rows to {output_path}")


# Run only when executed as a script, so importing this module (for example
# to reuse balance_by_label) does not read or write any files.
if __name__ == "__main__":
    main()
|