| import pandas as pd |
| from pathlib import Path |
| import sys |
| from tqdm import tqdm |
|
|
| def remove_english_comments(input_path, output_path=None): |
| """Remove English comments from a dataset with progress tracking""" |
| print(f"\nReading input file: {input_path}") |
| |
| |
| if output_path is None: |
| output_path = str(Path(input_path).with_suffix('').with_name(f"{Path(input_path).stem}_non_english.csv")) |
| |
| try: |
| |
| df = pd.read_csv(input_path, encoding='utf-8') |
| total_rows = len(df) |
| |
| print(f"\nDataset Info:") |
| print(f"Initial Rows: {total_rows:,}") |
| print(f"Columns: {', '.join(df.columns)}") |
| |
| |
| print("\nFiltering out English comments...") |
| non_english_df = df[df['lang'] != 'en'] |
| |
| |
| print(f"\nSaving to: {output_path}") |
| non_english_df.to_csv(output_path, index=False, encoding='utf-8') |
| |
| |
| english_rows = total_rows - len(non_english_df) |
| |
| print(f"\n✓ Successfully removed English comments") |
| print(f"Initial rows: {total_rows:,}") |
| print(f"Remaining non-English rows: {len(non_english_df):,}") |
| print(f"Removed English rows: {english_rows:,}") |
| print(f"Output file: {output_path}") |
| print(f"Output file size: {Path(output_path).stat().st_size / (1024*1024):.1f} MB") |
| |
| except Exception as e: |
| print(f"\n❌ Error: {str(e)}") |
| sys.exit(1) |
|
|
| if __name__ == "__main__": |
| input_path = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_347k_7LANG.csv" |
| output_path = input_path.replace(".csv", "_non_english.csv") |
| |
| remove_english_comments(input_path, output_path) |