# comment-guard-api / merge_datasets.py
# tejesh916K's picture
# Deploy: Comment Guard API - FastAPI + MuRIL BERT
# b8300d6
import pandas as pd
from pathlib import Path
# Lower-cased header names recognised as the text / label column of a sheet.
_TEXT_COLUMN_NAMES = ("text", "comment", "comments", "sentence")
_LABEL_COLUMN_NAMES = ("label", "labels", "category", "class")


def _find_column(columns, candidates):
    """Return the first column whose lower-cased name is in *candidates*, else None."""
    return next((c for c in columns if str(c).lower() in candidates), None)


def _unused_path(path):
    """Return *path* if nothing exists there, else the first free ``<stem>_<n><suffix>`` sibling."""
    candidate = path
    counter = 0
    while candidate.exists():
        counter += 1
        candidate = path.with_name(f"{path.stem}_{counter}{path.suffix}")
    return candidate


def merge_datasets(data_dir="data") -> None:
    """Merge the custom bad-words sheet into the main training dataset.

    Reads ``custom_badwords_dataset.xlsx`` and
    ``training_data_telugu-hate.xlsx`` from *data_dir*, renames the custom
    sheet's text/label columns to the names used by the main sheet, appends
    the custom rows, drops rows whose text already occurred earlier (the
    first occurrence is kept, so main-dataset rows win over custom rows),
    backs up the original main workbook and finally replaces it with the
    merged data.

    Progress and errors are reported with ``print``. If an input file is
    missing, or a text/label column cannot be identified in either sheet,
    an error is printed and the function returns without writing anything.

    Args:
        data_dir: Directory containing the two workbooks. Accepts a ``str``
            or ``Path``; defaults to ``"data"`` relative to the current
            working directory.

    Returns:
        None.
    """
    # Local import: only this function needs it, and it keeps the edit
    # independent of the file's top-level import block.
    import shutil

    data_dir = Path(data_dir)
    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"

    for required_file in (custom_words_file, main_dataset_file):
        if not required_file.exists():
            print(f"Error: {required_file} not found.")
            return

    # Load both datasets
    print("Loading data...")
    custom_df = pd.read_excel(custom_words_file)
    main_df = pd.read_excel(main_dataset_file)

    print(f"Original main dataset size: {len(main_df)}")
    print(f"Custom badwords size: {len(custom_df)}")

    # Identify the text/label columns in BOTH sheets, case-insensitively.
    text_col_main = _find_column(main_df.columns, _TEXT_COLUMN_NAMES)
    label_col_main = _find_column(main_df.columns, _LABEL_COLUMN_NAMES)
    text_col_custom = _find_column(custom_df.columns, _TEXT_COLUMN_NAMES)
    label_col_custom = _find_column(custom_df.columns, _LABEL_COLUMN_NAMES)

    # Refuse to guess. With a made-up text column every original row has a
    # missing value there, and de-duplicating on that column would collapse
    # the whole main dataset before it is written back.
    unresolved = [
        description
        for description, column in (
            (f"text column in {main_dataset_file}", text_col_main),
            (f"label column in {main_dataset_file}", label_col_main),
            (f"text column in {custom_words_file}", text_col_custom),
            (f"label column in {custom_words_file}", label_col_custom),
        )
        if column is None
    ]
    if unresolved:
        for description in unresolved:
            print(f"Error: could not identify a {description}.")
        return

    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")

    # Rename custom dataset columns to match main dataset
    custom_df = custom_df.rename(
        columns={text_col_custom: text_col_main, label_col_custom: label_col_main}
    )

    # Combine the dataframes (main rows first, so they survive de-duplication)
    merged_df = pd.concat([main_df, custom_df], ignore_index=True)

    # Drop rows whose text was already seen; only the text column is compared.
    merged_df = merged_df.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)
    print(f"New merged dataset size: {len(merged_df)}")

    # Back up the original workbook as a plain file copy, and never overwrite
    # an earlier backup: on a re-run the "original" is already merged data.
    backup_path = _unused_path(data_dir / "training_data_telugu-hate_backup2.xlsx")
    shutil.copy2(main_dataset_file, backup_path)
    print(f"Saved backup of original to {backup_path}")

    # Write to a sibling temp file first and then swap it in, so a failed
    # write cannot leave a truncated main dataset behind. The temp name keeps
    # the .xlsx suffix so pandas selects the same Excel writer.
    tmp_path = main_dataset_file.with_name(
        f"{main_dataset_file.stem}.tmp{main_dataset_file.suffix}"
    )
    try:
        merged_df.to_excel(tmp_path, index=False)
        tmp_path.replace(main_dataset_file)
    finally:
        if tmp_path.exists():
            tmp_path.unlink()
    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")
# Script entry point: perform the merge only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    merge_datasets()