| import pandas as pd |
| import numpy as np |
| from pathlib import Path |
| import os |
|
|
def load_dataset(file_path, encoding='utf-8'):
    """Load a CSV file into a DataFrame, retrying with fallback encodings.

    The requested ``encoding`` is tried first, followed by utf-8, latin1,
    iso-8859-1 and cp1252. Each distinct encoding name is attempted at most
    once.

    Args:
        file_path: Location of the CSV file; passed unchanged to
            ``pd.read_csv``.
        encoding: Preferred encoding to try before the fallbacks.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        ValueError: If no attempt succeeds. The last underlying error is
            attached as ``__cause__`` so the real reason is not lost.
    """
    fallbacks = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    # dict.fromkeys drops repeated names while preserving order, so a
    # preferred encoding that is also a fallback is not tried twice.
    candidates = list(dict.fromkeys([encoding, *fallbacks]))

    last_error = None
    for enc in candidates:
        try:
            return pd.read_csv(file_path, encoding=enc)
        except UnicodeDecodeError as e:
            # Wrong encoding for this file: silently move on to the next one.
            last_error = e
        except OSError as e:
            # Missing/unreadable file: a different encoding cannot help.
            print(f"Error with {enc}: {str(e)}")
            last_error = e
            break
        except Exception as e:
            print(f"Error with {enc}: {str(e)}")
            last_error = e

    raise ValueError(
        f"Could not read {file_path} with any encoding"
    ) from last_error
|
|
def _normalize_schema(df, required_cols, source_name):
    """Return ``df`` reduced to ``required_cols``, filling in what is missing.

    The first entry of ``required_cols`` is the text column; when absent it is
    copied from a ``text`` column. Any missing label column is added as 0.
    """
    text_col, *label_cols = required_cols

    if text_col not in df.columns:
        if 'text' not in df.columns:
            raise KeyError(
                f"{source_name} has neither a '{text_col}' nor a 'text' column"
            )
        df[text_col] = df['text']

    for col in label_cols:
        if col not in df.columns:
            df[col] = 0

    return df[required_cols]


def merge_english_comments(
    output_file=None,
    multilingual_file='dataset/raw/MULTILINGUAL_TOXIC_DATASET_347K_7LANG.csv',
    english_file='dataset/raw/english-comments-cleaned.csv',
):
    """Merge English comments from the multilingual and English datasets.

    Rows of the multilingual CSV whose ``lang`` column equals ``'en'`` are
    combined with every row of the English CSV. Both frames are reduced to
    ``comment_text`` plus the six toxicity label columns (missing label
    columns are filled with 0), concatenated, and de-duplicated on
    ``comment_text``. Multilingual rows come first in the concatenation, so
    they win when the same text appears in both sources. Progress and a
    per-label summary are printed, and the result is written as CSV.

    Args:
        output_file: Destination CSV path. Defaults to
            ``dataset/processed/english_merged.csv`` when ``None``.
        multilingual_file: Path of the multilingual source CSV.
        english_file: Path of the English-only source CSV.

    Returns:
        The merged, de-duplicated DataFrame.

    Raises:
        KeyError: If a source lacks both ``comment_text`` and ``text``, or the
            multilingual source has no ``lang`` column.
        ValueError: Propagated from ``load_dataset`` when a file cannot be read.
    """
    print("\nProcessing multilingual dataset...")
    multi_df = load_dataset(multilingual_file)
    # .copy() so that adding columns below does not write into a slice view.
    multi_df = multi_df[multi_df['lang'] == 'en'].copy()
    print(f"Found {len(multi_df):,} English comments in multilingual dataset")

    print("\nProcessing English cleaned dataset...")
    eng_df = load_dataset(english_file)
    print(f"Found {len(eng_df):,} comments in English dataset")

    required_cols = ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    label_cols = required_cols[1:]

    # Apply the same normalization (including the 'text' fallback) to both
    # sources so neither fails on a schema the other would accept.
    multi_df = _normalize_schema(multi_df, required_cols, "multilingual dataset")
    eng_df = _normalize_schema(eng_df, required_cols, "English dataset")

    print("\nMerging datasets...")
    merged_df = pd.concat([multi_df, eng_df], ignore_index=True)
    initial_count = len(merged_df)
    print(f"Initial merged size: {initial_count:,} comments")

    merged_df = merged_df.drop_duplicates(subset=['comment_text'], keep='first')
    final_count = len(merged_df)
    print(f"After removing duplicates: {final_count:,} comments")
    print(f"Removed {initial_count - final_count:,} duplicates")

    print("\nToxicity distribution in final dataset:")
    for col in label_cols:
        toxic_count = (merged_df[col] > 0).sum()
        # Guard against an empty merge result to avoid dividing by zero.
        percent = toxic_count / final_count * 100 if final_count else 0.0
        print(f"{col.replace('_', ' ').title()}: {toxic_count:,} ({percent:.1f}%)")

    if output_file is None:
        output_file = "dataset/processed/english_merged.csv"

    # Path.parent is '.' for a bare filename, whereas os.path.dirname gives ''
    # and os.makedirs('') raises FileNotFoundError.
    out_path = Path(output_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving merged dataset to: {output_file}")
    merged_df.to_csv(out_path, index=False)
    print(f"File size: {out_path.stat().st_size / (1024*1024):.1f} MB")

    return merged_df
|
|
if __name__ == "__main__":
    # Script entry point: build the merged English dataset and write it to
    # the standard processed-data location.
    output_file = "dataset/processed/english_merged.csv"
    merged_df = merge_english_comments(output_file=output_file)