import pandas as pd
import numpy as np
from pathlib import Path
import os


def clean_toxicity_labels(input_file, output_file=None):
    """Clean toxicity labels by converting fractional values to binary using ceiling."""
    print(f"\nReading dataset: {input_file}")
    df = pd.read_csv(input_file)

    total_rows = len(df)
    print(f"\nInitial dataset size: {total_rows:,} comments")

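    # Label columns to convert to binary 0/1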
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    print("\nInitial value distribution:")
    print("-" * 50)
    for col in toxicity_cols:
        unique_vals = df[col].value_counts().sort_index()
        print(f"\n{col.replace('_', ' ').title()}:")
        for val, count in unique_vals.items():
            print(f"  {val}: {count:,} comments")

    print("\nCleaning labels...")
    for col in toxicity_cols:
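        # Record which values fall outside {0, 1} before conversion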
        unique_before = df[col].nunique()
        non_binary = df[~df[col].isin([0, 1])][col].unique()

        if len(non_binary) > 0:
            print(f"\n{col.replace('_', ' ').title()}:")
            print(f"  Found {len(non_binary)} non-binary values: {sorted(non_binary)}")

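            # Ceiling maps any fractional label up to 1; clip keeps values in [0, 1]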
            df[col] = np.ceil(df[col]).clip(0, 1).astype(int)

            unique_after = df[col].nunique()
            print(f"  Unique values before: {unique_before}")
            print(f"  Unique values after: {unique_after}")

    print("\nFinal value distribution:")
    print("-" * 50)
    for col in toxicity_cols:
        value_counts = df[col].value_counts().sort_index()
        total = len(df)
        print(f"\n{col.replace('_', ' ').title()}:")
        for val, count in value_counts.items():
            percentage = (count / total) * 100
            print(f"  {val}: {count:,} comments ({percentage:.2f}%)")

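    # Derive a default output path next to the input file when none is given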
    if output_file is None:
        base, ext = os.path.splitext(input_file)
        output_file = f"{base}_cleaned{ext}"

    # Only create a directory if the output path actually contains one
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    print(f"\nSaving cleaned dataset to: {output_file}")
    df.to_csv(output_file, index=False)
    print(f"File size: {Path(output_file).stat().st_size / (1024*1024):.1f} MB")

    return df


if __name__ == "__main__":
    input_file = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG.csv"
    output_file = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv"

    cleaned_df = clean_toxicity_labels(input_file, output_file)
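
    # Optional sanity check (an added assumption, not part of the original script):
    # after cleaning, every label column should contain only 0 or 1.
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    assert cleaned_df[label_cols].isin([0, 1]).all().all(), "Non-binary labels remain after cleaning"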