# Spaces: Runtime error
"""Down-sample a labelled dataset so every label has the same number of rows.

Reads the cleaned CSV at ``input_path``, balances it on the ``label`` column
by randomly down-sampling every class to the size of the smallest class,
shuffles the result and writes it to ``output_path``.
"""

import pandas as pd

input_path = "data/processed/url_dataset_cleaned.csv"
output_path = "data/processed/url_dataset_balanced.csv"


def balance_by_label(
    df: pd.DataFrame, label_col: str = "label", random_state: int = 42
) -> pd.DataFrame:
    """Return a shuffled copy of *df* with an equal number of rows per label.

    Every class larger than the smallest one is randomly down-sampled
    (without replacement) to the smallest class size; classes already at
    that size are kept whole. Rows whose label is missing (NaN) are dropped,
    because ``value_counts`` does not count them.

    Args:
        df: Input frame containing ``label_col``.
        label_col: Name of the column holding the class label.
        random_state: Seed used both for down-sampling and the final shuffle.

    Returns:
        A new frame with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``label_col`` is not a column of *df*.
        ValueError: If *df* contains no labelled rows.
    """
    label_counts = df[label_col].value_counts()
    if label_counts.empty:
        raise ValueError(f"No labelled rows found in column {label_col!r}")

    minority_count = int(label_counts.min())

    # Iterate per label instead of relying on idxmin()/idxmax(): on tied
    # counts both return the *same* label, which would duplicate one class
    # and drop the other. Smallest class first keeps the concat order (and
    # therefore the shuffled output) of the two-class case unchanged.
    parts = []
    for label, count in label_counts.sort_values(kind="stable").items():
        rows = df[df[label_col] == label]
        if count > minority_count:
            rows = rows.sample(n=minority_count, random_state=random_state)
        parts.append(rows)

    return (
        pd.concat(parts)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


def main() -> None:
    """Load the cleaned dataset, balance it on ``label`` and save the result."""
    print("Loading dataset...")
    df = pd.read_csv(input_path)
    print(f"Total rows: {len(df):,}")

    label_counts = df["label"].value_counts()
    if label_counts.empty:
        raise ValueError(f"No labelled rows found in {input_path}")

    # Report whatever labels are present rather than assuming exactly 0 and 1.
    print(
        " | ".join(
            f"Label {label}: {count:,}"
            for label, count in label_counts.sort_index().items()
        )
    )

    minority_count = label_counts.min()
    minority_label = label_counts.idxmin()
    print(
        f"\nBalancing to {minority_count:,} per label "
        f"(matching label {minority_label})..."
    )

    df_balanced = balance_by_label(df)

    label_counts_new = df_balanced["label"].value_counts().sort_index()
    print("\nBalanced dataset:")
    for label, count in label_counts_new.items():
        print(f" Label {label}: {count:,}")

    df_balanced.to_csv(output_path, index=False)
    print(f"\nSaved {len(df_balanced):,} rows to {output_path}")


if __name__ == "__main__":
    main()