| import pandas as pd |
| from pathlib import Path |
| import logging |
| from datetime import datetime |
|
|
# Configure root logging once at import time: INFO level, timestamped messages.
LOG_FORMAT = '%(asctime)s | %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Module-level logger shared by this script's functions.
logger = logging.getLogger(__name__)
|
|
def merge_datasets(
    main_path="dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv",
    augmented_dir="dataset/augmented",
    output_path="dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv",
):
    """Merge the latest augmented threat dataset into the main dataset.

    Loads the main dataset and the most recent ``threat_augmented_*.csv``
    file, standardizes the augmented rows to the main schema (each row
    labeled as an English threat), drops rows whose text already occurs,
    concatenates, and writes the result to *output_path*.

    Args:
        main_path: CSV with the main dataset; must have a ``comment_text``
            column (defaults to the original hard-coded path).
        augmented_dir: Directory holding ``threat_augmented_*.csv`` files;
            the lexicographically latest (timestamp-named) file is used.
        output_path: Destination CSV for the merged dataset.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If no ``threat_augmented_*.csv`` file exists in
            *augmented_dir*.
        Exception: Any other failure is logged with traceback and re-raised.
    """
    log = logging.getLogger(__name__)
    try:
        log.info("Loading main dataset...")
        main_df = pd.read_csv(main_path)
        log.info(f"Main dataset: {len(main_df):,} rows")

        # File names embed a timestamp, so the lexicographic maximum is the
        # most recent file. Fail clearly when nothing matches instead of the
        # opaque ValueError that max() over an empty glob would raise.
        augmented_files = sorted(Path(augmented_dir).glob("threat_augmented_*.csv"))
        if not augmented_files:
            raise FileNotFoundError(
                f"No threat_augmented_*.csv files found in {augmented_dir}"
            )
        latest_augmented = augmented_files[-1]
        log.info(f"Loading augmented dataset: {latest_augmented.name}")
        aug_df = pd.read_csv(latest_augmented)
        log.info(f"Augmented dataset: {len(aug_df):,} rows")

        # Map augmented rows onto the main schema: every augmented sample is
        # an English threat, hence toxic=1 and threat=1, all other labels 0.
        log.info("Standardizing columns...")
        aug_df_standardized = pd.DataFrame({
            'comment_text': aug_df['text'],
            'toxic': 1,
            'severe_toxic': 0,
            'obscene': 0,
            'threat': 1,
            'insult': 0,
            'identity_hate': 0,
            'lang': 'en'
        })

        # Flag augmented texts that already occur in the main dataset, or
        # earlier within the augmented data itself (keep='first'). Using
        # ignore_index + an explicit positional slice avoids relying on
        # label-based slicing over duplicated indices.
        log.info("Checking for duplicates...")
        combined_texts = pd.concat(
            [main_df['comment_text'], aug_df_standardized['comment_text']],
            ignore_index=True,
        )
        aug_duplicates = (
            combined_texts.duplicated(keep='first')[len(main_df):].to_numpy()
        )
        log.info(f"Found {aug_duplicates.sum()} duplicates in augmented data")

        aug_df_standardized = aug_df_standardized[~aug_duplicates]
        log.info(
            f"Augmented dataset after duplicate removal: {len(aug_df_standardized):,} rows"
        )

        merged_df = pd.concat([main_df, aug_df_standardized], ignore_index=True)
        log.info(f"Final merged dataset: {len(merged_df):,} rows")

        merged_df.to_csv(output_path, index=False)
        log.info(f"Saved merged dataset to: {output_path}")

        log.info("\nDataset Statistics:")
        log.info(f"Original samples: {len(main_df):,}")
        log.info(f"Added threat samples: {len(aug_df_standardized):,}")
        log.info(f"Total samples: {len(merged_df):,}")
        log.info(f"Threat samples in final dataset: {merged_df['threat'].sum():,}")

        return merged_df

    except Exception:
        # Log the full traceback, then let the caller decide what to do.
        log.exception("Error merging datasets")
        raise
|
|
| if __name__ == "__main__": |
| merged_df = merge_datasets() |